Issue with gpu020

@Yann.Sagon
I am afraid that something is wrong with gpu020 too.

I was running an OpenMM job that uses MPI and spans several nodes (8 CPUs and 8 GPUs in total):

46509730 shared-gp bromo_li karrenbr  R       5:17      3 gpu[009-010,020]

The processes on gpu020 produced the output below, and now the job is stuck: it doesn’t die, but it doesn’t make progress either (I killed it manually with scancel).
.
.

mlx5: gpu020.cluster: got completion with error:
00000000 00000000 00000000 00000000
00000000 00000000 00000000 00000000
00000017 00000000 00000000 00000000
00000000 00008a12 0a003e61 0001f6d2
mlx5: gpu020.cluster: got completion with error:
00000000 00000000 00000000 00000000
00000000 00000000 00000000 00000000
00000017 00000000 00000000 00000000
00000000 00008a12 0a003e5f 0001c8d2
[gpu020:200839:0:200839] rc_verbs_iface.c:63   FATAL: send completion with error: remote invalid request error
[gpu020:200840:0:200840] rc_verbs_iface.c:63   FATAL: send completion with error: remote invalid request error
mlx5: gpu020.cluster: got completion with error:
00000000 00000000 00000000 00000000
00000000 00000000 00000000 00000000
00000018 00000000 00000000 00000000
00000000 00008a12 0a003e60 0002fbd2
[gpu020:200838:0:200838] rc_verbs_iface.c:63   FATAL: send completion with error: remote invalid request error
==== backtrace ====
 0 0x0000000000033869 uct_rc_verbs_ep_t_delete()  ???:0
 1 0x0000000000017dba ucp_worker_progress()  ???:0
 2 0x00000000000039e7 mca_pml_ucx_progress()  ???:0
 3 0x000000000002fcac opal_progress()  ???:0
 4 0x0000000000036335 ompi_sync_wait_mt()  ???:0
 5 0x00000000000500f9 ompi_request_default_wait()  ???:0
 6 0x00000000000a1d53 ompi_coll_base_sendrecv_actual()  ???:0
 7 0x00000000000a0ae2 ompi_coll_base_allgather_intra_recursivedoubling()  ???:0
 8 0x00000000000629c3 MPI_Allgather()  ???:0
 9 0x00000000000c7613 __pyx_f_6mpi4py_3MPI_PyMPI_allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:54999
10 0x00000000000c7e9b __pyx_pf_6mpi4py_3MPI_4Comm_212allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:118559
11 0x00000000000c7e9b __pyx_pw_6mpi4py_3MPI_4Comm_213allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:118523
12 0x0000000000165914 _PyMethodDef_RawFastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:693
13 0x000000000016c7af _PyMethodDescr_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/descrobject.c:288
14 0x00000000001d1c7c call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4593
15 0x00000000001d1c7c _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
16 0x0000000000114829 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
17 0x0000000000115925 _PyFunction_FastCallDict()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:376
18 0x00000000001344d3 _PyObject_Call_Prepend()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:906
19 0x0000000000126ffe PyObject_Call()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:245
20 0x00000000001ceeea do_call_core()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4645
21 0x00000000001ceeea _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3191
22 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
23 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
24 0x00000000001ce585 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
25 0x00000000001ce585 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3139
26 0x000000000011585b function_code_fastcall()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:283
27 0x000000000011585b _PyFunction_FastCallDict()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:322
28 0x00000000001ceeea do_call_core()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4645
29 0x00000000001ceeea _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3191
30 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
31 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
32 0x00000000001cd740 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
33 0x00000000001cd740 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
34 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
35 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
36 0x00000000001cd740 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
37 0x00000000001cd740 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
38 0x0000000000164e7b function_code_fastcall()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:283
39 0x0000000000164e7b _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:408
40 0x00000000001cd4b6 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
41 0x00000000001cd4b6 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3124
42 0x0000000000114829 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
43 0x0000000000115714 PyEval_EvalCodeEx()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3959
44 0x000000000011573c PyEval_EvalCode()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:524
45 0x000000000022cf14 run_mod()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:1035
46 0x0000000000237331 PyRun_FileExFlags()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:988
47 0x0000000000237523 PyRun_SimpleFileExFlags()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:429
48 0x0000000000238655 pymain_run_file()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:462
49 0x0000000000238655 pymain_run_filename()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:1652
50 0x0000000000238655 pymain_run_python()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:2913
51 0x0000000000238655 pymain_main()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:3460
52 0x000000000023877c _Py_UnixMain()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:3495
53 0x0000000000022555 __libc_start_main()  ???:0
54 0x00000000001dcff0 _start()  /home/rdonnelly/mc/conda-bld/compilers_linux-64_1534865402226/work/.build/src/glibc-2.12.2/csu/../sysdeps/x86_64/elf/start.S:103
===================
==== backtrace ====
 0 0x0000000000033869 uct_rc_verbs_ep_t_delete()  ???:0
 1 0x0000000000017dba ucp_worker_progress()  ???:0
 2 0x00000000000039e7 mca_pml_ucx_progress()  ???:0
 3 0x000000000002fcac opal_progress()  ???:0
 4 0x0000000000036335 ompi_sync_wait_mt()  ???:0
 5 0x00000000000500f9 ompi_request_default_wait()  ???:0
 6 0x00000000000a1d53 ompi_coll_base_sendrecv_actual()  ???:0
 7 0x00000000000a0ae2 ompi_coll_base_allgather_intra_recursivedoubling()  ???:0
 8 0x00000000000629c3 MPI_Allgather()  ???:0
 9 0x00000000000c7613 __pyx_f_6mpi4py_3MPI_PyMPI_allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:54999
10 0x00000000000c7e9b __pyx_pf_6mpi4py_3MPI_4Comm_212allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:118559
11 0x00000000000c7e9b __pyx_pw_6mpi4py_3MPI_4Comm_213allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:118523
12 0x0000000000165914 _PyMethodDef_RawFastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:693
13 0x000000000016c7af _PyMethodDescr_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/descrobject.c:288
14 0x00000000001d1c7c call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4593
15 0x00000000001d1c7c _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
16 0x0000000000114829 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
17 0x0000000000115925 _PyFunction_FastCallDict()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:376
18 0x00000000001344d3 _PyObject_Call_Prepend()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:906
19 0x0000000000126ffe PyObject_Call()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:245
20 0x00000000001ceeea do_call_core()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4645
21 0x00000000001ceeea _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3191
22 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
23 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
24 0x00000000001ce585 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
25 0x00000000001ce585 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3139
26 0x000000000011585b function_code_fastcall()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:283
27 0x000000000011585b _PyFunction_FastCallDict()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:322
28 0x00000000001ceeea do_call_core()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4645
29 0x00000000001ceeea _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3191
30 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
31 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
32 0x00000000001cd740 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
33 0x00000000001cd740 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
34 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
35 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
36 0x00000000001cd740 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
37 0x00000000001cd740 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
38 0x0000000000164e7b function_code_fastcall()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:283
39 0x0000000000164e7b _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:408
40 0x00000000001cd4b6 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
41 0x00000000001cd4b6 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3124
42 0x0000000000114829 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
43 0x0000000000115714 PyEval_EvalCodeEx()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3959
44 0x000000000011573c PyEval_EvalCode()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:524
45 0x000000000022cf14 run_mod()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:1035
46 0x0000000000237331 PyRun_FileExFlags()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:988
47 0x0000000000237523 PyRun_SimpleFileExFlags()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:429
48 0x0000000000238655 pymain_run_file()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:462
49 0x0000000000238655 pymain_run_filename()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:1652
50 0x0000000000238655 pymain_run_python()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:2913
51 0x0000000000238655 pymain_main()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:3460
52 0x000000000023877c _Py_UnixMain()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:3495
53 0x0000000000022555 __libc_start_main()  ???:0
54 0x00000000001dcff0 _start()  /home/rdonnelly/mc/conda-bld/compilers_linux-64_1534865402226/work/.build/src/glibc-2.12.2/csu/../sysdeps/x86_64/elf/start.S:103
===================
==== backtrace ====
 0 0x0000000000033869 uct_rc_verbs_ep_t_delete()  ???:0
 1 0x0000000000017dba ucp_worker_progress()  ???:0
 2 0x00000000000039e7 mca_pml_ucx_progress()  ???:0
 3 0x000000000002fcac opal_progress()  ???:0
 4 0x0000000000036335 ompi_sync_wait_mt()  ???:0
 5 0x00000000000500f9 ompi_request_default_wait()  ???:0
 6 0x00000000000a1d53 ompi_coll_base_sendrecv_actual()  ???:0
 7 0x00000000000a0ae2 ompi_coll_base_allgather_intra_recursivedoubling()  ???:0
 8 0x00000000000629c3 MPI_Allgather()  ???:0
 9 0x00000000000c7613 __pyx_f_6mpi4py_3MPI_PyMPI_allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:54999
10 0x00000000000c7e9b __pyx_pf_6mpi4py_3MPI_4Comm_212allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:118559
11 0x00000000000c7e9b __pyx_pw_6mpi4py_3MPI_4Comm_213allgather()  /home/users/k/karrenbr/mpy4py/mpi4py-3.0.3/src/mpi4py.MPI.c:118523
12 0x0000000000165914 _PyMethodDef_RawFastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:693
13 0x000000000016c7af _PyMethodDescr_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/descrobject.c:288
14 0x00000000001d1c7c call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4593
15 0x00000000001d1c7c _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
16 0x0000000000114829 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
17 0x0000000000115925 _PyFunction_FastCallDict()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:376
18 0x00000000001344d3 _PyObject_Call_Prepend()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:906
19 0x0000000000126ffe PyObject_Call()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:245
20 0x00000000001ceeea do_call_core()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4645
21 0x00000000001ceeea _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3191
22 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
23 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
24 0x00000000001ce585 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
25 0x00000000001ce585 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3139
26 0x000000000011585b function_code_fastcall()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:283
27 0x000000000011585b _PyFunction_FastCallDict()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:322
28 0x00000000001ceeea do_call_core()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4645
29 0x00000000001ceeea _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3191
30 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
31 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
32 0x00000000001cd740 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
33 0x00000000001cd740 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
34 0x0000000000115160 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
35 0x0000000000165107 _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:433
36 0x00000000001cd740 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
37 0x00000000001cd740 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3110
38 0x0000000000164e7b function_code_fastcall()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:283
39 0x0000000000164e7b _PyFunction_FastCallKeywords()  /tmp/build/80754af9/python_1598874792229/work/Objects/call.c:408
40 0x00000000001cd4b6 call_function()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:4616
41 0x00000000001cd4b6 _PyEval_EvalFrameDefault()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3124
42 0x0000000000114829 _PyEval_EvalCodeWithName()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3930
43 0x0000000000115714 PyEval_EvalCodeEx()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:3959
44 0x000000000011573c PyEval_EvalCode()  /tmp/build/80754af9/python_1598874792229/work/Python/ceval.c:524
45 0x000000000022cf14 run_mod()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:1035
46 0x0000000000237331 PyRun_FileExFlags()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:988
47 0x0000000000237523 PyRun_SimpleFileExFlags()  /tmp/build/80754af9/python_1598874792229/work/Python/pythonrun.c:429
48 0x0000000000238655 pymain_run_file()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:462
49 0x0000000000238655 pymain_run_filename()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:1652
50 0x0000000000238655 pymain_run_python()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:2913
51 0x0000000000238655 pymain_main()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:3460
52 0x000000000023877c _Py_UnixMain()  /tmp/build/80754af9/python_1598874792229/work/Modules/main.c:3495
53 0x0000000000022555 __libc_start_main()  ???:0
54 0x00000000001dcff0 _start()  /home/rdonnelly/mc/conda-bld/compilers_linux-64_1534865402226/work/.build/src/glibc-2.12.2/csu/../sysdeps/x86_64/elf/start.S:103
===================

.
.
.
PS: gpu014, gpu012 and gpu004 are down, and gpu016 is in the drain state.

Hi, this looks like an InfiniBand + MPI error, maybe caused by your code? Remember that the GPUs on gpu020 are split (Nvidia A100, Ampere architecture, with MIG), so maybe your job requires too much memory?

1 Like

Hi, sorry for the late reply,

I don’t know how much video RAM my application needs, but it might be that the application I am running has problems when dealing with split GPUs. I don’t know.

Is there a way to tell Slurm to use the shared-gpu partition while excluding node gpu020?