Zion Boggan
repos/GPU CPU Mutex/examples/remote_dispatch.py
zionboggan.com ↗
42 lines · python
History for this file →
1
"""Optional: enforce the same GPU mutex from inside a Python dispatcher.

If your GPU jobs are fired over SSH to a remote box (or from a long-running Python
loop rather than a shell), you can hold the *same* flock in-process so the wrapper
and the dispatcher share one lock. The reentrancy trick is identical to gpu_run.sh:
honor an inherited GPU_LOCK_HELD and skip re-locking, and respect the GPU_SKIP_LOCK
skip flag for non-GPU work (builds, uploads) that happens to run on the same box.

This is a generic skeleton - replace the dispatch body with your own call.
"""
11
import fcntl
12
import os
13
import subprocess
14
import sys
15
 
16
# Path of the lock file that gpu_dispatch flocks. It can be overridden through
# the GPU_LOCK_PATH environment variable; otherwise /tmp/gpu.lock is used.
GPU_LOCK_PATH = os.getenv("GPU_LOCK_PATH", "/tmp/gpu.lock")
17
 
18
 
19
def gpu_dispatch(argv, is_gpu_job=True, timeout=600):
    """Run *argv* as a subprocess, holding the shared GPU flock when required.

    The exclusive lock on GPU_LOCK_PATH is taken only when all of these hold:
    ``is_gpu_job`` is true, the GPU_LOCK_HELD environment variable is unset or
    empty (no ancestor already owns the lock), and GPU_SKIP_LOCK is unset or
    empty. While the lock is held the child is started with GPU_LOCK_HELD=1 in
    its environment so nested wrappers skip re-locking.

    Args:
        argv: Command and arguments, passed to ``subprocess.run`` unchanged.
        is_gpu_job: Set to False for work that must not wait on the GPU lock.
        timeout: Seconds allowed for the child; forwarded to ``subprocess.run``.

    Returns:
        The child's return code.

    Raises:
        ValueError: If *argv* is empty.
        subprocess.TimeoutExpired: If the child exceeds *timeout*.
        OSError: If the lock file cannot be opened or the command cannot start.
    """
    if not argv:
        # Fail fast: never queue on the GPU lock for a command that cannot run.
        raise ValueError("gpu_dispatch: argv must name a command to run")

    need_lock = (
        is_gpu_job
        and not os.environ.get("GPU_LOCK_HELD")
        and not os.environ.get("GPU_SKIP_LOCK")
    )
    if not need_lock:
        return subprocess.run(argv, text=True, timeout=timeout).returncode

    # Append mode creates the lock file when missing but never truncates it,
    # so anything the current holder wrote into the file is left intact.
    with open(GPU_LOCK_PATH, "a") as lockf:
        sys.stderr.write("[dispatch] waiting for GPU lock...\n")
        fcntl.flock(lockf, fcntl.LOCK_EX)  # blocks until the lock is free
        try:
            sys.stderr.write("[dispatch] acquired GPU lock\n")
            # Hand the reentrancy flag to the child only. Setting it in
            # os.environ would make a concurrent gpu_dispatch call in this
            # process believe the lock is inherited and skip locking.
            child_env = dict(os.environ, GPU_LOCK_HELD="1")
            completed = subprocess.run(
                argv, text=True, timeout=timeout, env=child_env
            )
            return completed.returncode
        finally:
            # Release explicitly; leaving the with-block then closes the file.
            fcntl.flock(lockf, fcntl.LOCK_UN)
39
 
40
 
41
if __name__ == "__main__":
    # CLI entry point: everything after the script name is the command to
    # dispatch, and the process exits with that command's return code.
    if len(sys.argv) < 2:
        # No command given: report usage instead of passing an empty argv on
        # to the dispatcher, where it would only fail with a traceback.
        sys.stderr.write("usage: {} COMMAND [ARG...]\n".format(sys.argv[0]))
        sys.exit(2)
    sys.exit(gpu_dispatch(sys.argv[1:]))