Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Failed analyses with subtasks marked as queued #1061

Open
sambles opened this issue Jun 4, 2024 · 1 comment
Open

Failed analyses with subtasks marked as queued #1061

sambles opened this issue Jun 4, 2024 · 1 comment
Assignees

Comments

@sambles
Copy link
Contributor

sambles commented Jun 4, 2024

Issue Description

If a failed analysis has subtasks marked with status QUEUED, then the auto-scaler will not spin down to zero even though nothing is running.

logs

{
  "created": "2024-06-03T15:12:20.041454Z",
  "modified": "2024-06-03T15:18:22.728742Z",
  "name": "Analysis_03062024-161219",
  "id": 5,
  "portfolio": 5,
  "model": 2,
  "status": "RUN_ERROR",
  "run_mode": "V2",
  "task_started": "2024-06-03T15:13:04.173610Z",
  "task_finished": "2024-06-03T15:18:22.664475Z",
  "complex_model_data_files": [],
  "input_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/input_file/",
  "settings_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/settings_file/",
  "settings": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/settings/",
  "lookup_errors_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/lookup_errors_file/",
  "lookup_success_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/lookup_success_file/",
  "lookup_validation_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/lookup_validation_file/",
  "summary_levels_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/summary_levels_file/",
  "input_generation_traceback_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/input_generation_traceback_file/",
  "output_file": null,
  "run_traceback_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/run_traceback_file/",
  "run_log_file": null,
  "storage_links": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/storage_links/",
  "chunking_configuration": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/chunking_configuration/",
  "lookup_chunks": 5,
  "analysis_chunks": 10,
  "sub_task_count": 15,
  "groups": [],
  "sub_task_list": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/sub_task_list/",
  "sub_task_error_ids": [],
  "status_count": {
    "TOTAL_IN_QUEUE": 3,
    "TOTAL": 15,
    "PENDING": 0,
    "QUEUED": 3,
    "STARTED": 0,
    "COMPLETED": 12,
    "CANCELLED": 0,
    "ERROR": 0
  },
  "priority": 4
}
[
  {
    "id": 128,
    "task_id": "a603de05-6d93-4111-a029-37a88bb5e8fa",
    "status": "QUEUED",
    "queue_name": "OasisLMF-PiWind-2-v2",
    "name": "Generate losses output",
    "slug": "generate_losses_output",
    "pending_time": "2024-06-03T15:13:03.829714Z",
    "queue_time": "2024-06-03T15:13:04.176875Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 129,
    "task_id": "23db7750-e291-4038-a816-817b78b09970",
    "status": "QUEUED",
    "queue_name": "celery-v2",
    "name": "Record losses files",
    "slug": "record-losses-files",
    "pending_time": "2024-06-03T15:13:03.829741Z",
    "queue_time": "2024-06-03T15:13:04.176875Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 130,
    "task_id": "b6b8139a-3511-41a0-9f90-d84eeb51c892",
    "status": "QUEUED",
    "queue_name": "OasisLMF-PiWind-2-v2",
    "name": "Cleanup losses generation",
    "slug": "cleanup-losses-generation",
    "pending_time": "2024-06-03T15:13:03.829767Z",
    "queue_time": "2024-06-03T15:13:04.176875Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  }
]
2024-06-04 11:39:00,360 DEBUG: Socket message: {'time': '2024-06-04T11:39:00.316702Z', 'type': 'queue_status.updated', 'status': 'SUCCESS', 'content': [{'queue': {'name': 'OasisLMF-PiWind-2-v2', 'pending_count': 0, 'queued_count': 2, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': [{'id': 2, 'supplier_id': 'OasisLMF', 'model_id': 'PiWind', 'version_id': '2', 'created': '2024-06-03T08:32:35.951484Z', 'modified': '2024-06-03T08:32:36.567678Z', 'data_files': [], 'settings': '/api/v2/models/2/settings/', 'versions': '/api/v2/models/2/versions/', 'scaling_configuration': '/api/v2/models/2/scaling_configuration/', 'chunking_configuration': '/api/v2/models/2/chunking_configuration/', 'groups': [], 'run_mode': 'V2'}]}, 'analyses': []}, {'queue': {'name': 'celery', 'pending_count': 0, 'queued_count': 0, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': []}, 'analyses': []}, {'queue': {'name': 'celery-v2', 'pending_count': 0, 'queued_count': 1, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': []}, 'analyses': []}, {'queue': {'name': 'task-controller', 'pending_count': 0, 'queued_count': 0, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': []}, 'analyses': []}]}
2024-06-04 11:39:00,360 DEBUG: Analyses pending: {'pending-task_OasisLMF-PiWind-2-v2': {'id': None, 'tasks': 1, 'queue_names': ['OasisLMF-PiWind-2-v2'], 'priority': 4}}
2024-06-04 11:39:00,360 DEBUG: Analyses running: {}
2024-06-04 11:39:00,360 DEBUG: Model statuses: {'oasislmf-piwind-2-v2': {'tasks': 1, 'analyses': 1, 'priority': 4}, 'oasislmf-piwind-1-v1': {'tasks': 0, 'analyses': 0, 'priority': 1}}
2024-06-04 11:39:00,361 DEBUG: Scaling: [('oasislmf-piwind-2-v2', {'tasks': 1, 'analyses': 1, 'priority': 4}, <worker_deployments.WorkerDeployment object at 0x7fb7e0611450>)]
2024-06-04 11:39:00,568 DEBUG: Total desired number of workers: 1
@sambles
Copy link
Contributor Author

sambles commented Jun 5, 2024

Another example: the worker monitor-v2 task handle_task_failure needs to update the status of the other sub-tasks on a workflow error.
Mark all other subtasks which are queued or pending as CANCELLED or ERROR.

[2024-06-05 12:15:55,973: INFO/ForkPoolWorker-3] handle_task_failure[a5b2f784-3363-4c90-9d5d-80ea505dab04]: analysis_pk: 3, initiator_pk: 3, traceback: Traceback (most recent call last):
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/trace.py", line 477, in trace_task
    R = retval = fun(*args, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/trace.py", line 760, in __protected_call__
    return self.run(*args, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/autoretry.py", line 60, in run
    ret = task.retry(exc=exc, **retry_kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/task.py", line 736, in retry
    raise_with_context(exc)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/autoretry.py", line 38, in run
    return task._orig_run(*args, **kwargs)
  File "/home/worker/src/model_execution_worker/distributed_tasks.py", line 447, in run
    return fn(self, params, *args, analysis_id=analysis_id, run_data_uuid=run_data_uuid, **kwargs)
  File "/home/worker/src/model_execution_worker/distributed_tasks.py", line 508, in pre_analysis_hook
    params['pre_loc_file'] = filestore.put(
  File "/home/worker/.local/lib/python3.10/site-packages/oasis_data_manager/filestore/backends/base.py", line 318, in put
    self.fs.put(reference, storage_location)
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/dirfs.py", line 184, in put
    return self.fs.put(
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/spec.py", line 1055, in put
    self.put_file(lpath, rpath, callback=child, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/local.py", line 138, in put_file
    return self.cp_file(path1, path2, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/local.py", line 124, in cp_file
    shutil.copyfile(path1, path2)
  File "/usr/lib/python3.10/shutil.py", line 256, in copyfile
    with open(dst, 'wb') as fdst:
FileNotFoundError: [Errno 2] No such file or directory: '/shared-fs/analysis-3_files-6dcb6b968c774f59b53522d4bd81b07b/location.csv'

[
  {
    "id": 15,
    "task_id": "c7f94103-0d34-48cc-b62e-8095cd356766",
    "status": "COMPLETED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Prepare input generation params",
    "slug": "prepare-input-generation-params",
    "pending_time": "2024-06-05T12:15:38.826860Z",
    "queue_time": null,
    "start_time": "2024-06-05T12:15:39.100452Z",
    "end_time": "2024-06-05T12:15:40.359411Z",
    "output_log": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analysis-task-statuses/15/output_log/",
    "error_log": null
  },
  {
    "id": 16,
    "task_id": "b6934042-8424-4c08-933b-197afe5963f0",
    "status": "ERROR",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Pre analysis hook",
    "slug": "pre-analysis-hook",
    "pending_time": "2024-06-05T12:15:38.826986Z",
    "queue_time": null,
    "start_time": "2024-06-05T12:15:39.427706Z",
    "end_time": "2024-06-05T12:15:55.971324Z",
    "output_log": null,
    "error_log": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analysis-task-statuses/16/error_log/"
  },
  {
    "id": 17,
    "task_id": "3f9364a9-a693-4f3b-b649-02bc336ac2de",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Prepare keys file 0",
    "slug": "prepare-keys-file-0",
    "pending_time": "2024-06-05T12:15:38.827020Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 18,
    "task_id": "d71f2573-dbd4-42b6-8885-4fa121b51b45",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Collect keys",
    "slug": "collect-keys",
    "pending_time": "2024-06-05T12:15:38.827050Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 19,
    "task_id": "4ff2f76d-97d0-4318-bca5-c53229962259",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Write input files",
    "slug": "write-input-files",
    "pending_time": "2024-06-05T12:15:38.827078Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 20,
    "task_id": "1f7355f4-0d9d-466e-a82d-62b009088566",
    "status": "QUEUED",
    "queue_name": "celery-v2",
    "name": "Record input files",
    "slug": "record-input-files",
    "pending_time": "2024-06-05T12:15:38.827106Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 21,
    "task_id": "21a40748-a820-40c5-a839-4d9a8ffaa4c4",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Cleanup input generation",
    "slug": "cleanup-input-generation",
    "pending_time": "2024-06-05T12:15:38.827136Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  }
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
Status: Todo
Development

No branches or pull requests

1 participant