Enhance tentacle.sh with GPU access validation including container state checks and improved logging
This commit is contained in:
+44
-4
@@ -301,15 +301,55 @@ docker run -d \
|
||||
# -----------------------------
|
||||
# Test GPU Access
|
||||
# -----------------------------
|
||||
# Fixed 5-second pause before the container state check that follows;
# presumably gives the freshly started container time to initialise
# (or to crash) before it is inspected -- TODO confirm it is still needed.
sleep 5
|
||||
#######################################
# Poll a Docker container until it reports the "running" status.
#
# The state is read from `.State.Status` instead of the `.State.Running`
# boolean: Docker documents that `Running` and `Paused` are not mutually
# exclusive and recommends `Status` for this check. Testing `Running`
# first would report a paused container (and, in the Moby engine, one
# that is caught in a restart loop) as healthy.
#
# Arguments:
#   $1 - container name or ID
#   $2 - timeout in seconds (optional, default 60)
# Outputs:
#   On failure, reports the last observed state through err and prints
#   the container logs.
# Returns:
#   0 as soon as the container is running; 1 if it is in a state that
#   will not become running on its own, cannot be inspected, or the
#   timeout elapses.
#######################################
wait_for_container_running() {
  local name="$1"
  local timeout="${2:-60}"
  local interval=2
  local elapsed=0
  local state=""

  while [ "$elapsed" -lt "$timeout" ]; do
    # A failing inspect (unknown container, daemon unreachable) is
    # reported as "missing"; stderr is silenced on purpose because the
    # failure is surfaced through the final err message instead.
    state="$(docker inspect -f '{{.State.Status}}' "$name" 2>/dev/null)" \
      || state="missing"

    case "$state" in
      running)
        return 0
        ;;
      created | exited | dead | removing | paused | missing)
        # None of these turn into "running" without outside action,
        # so fail fast instead of burning the whole timeout.
        break
        ;;
      *)
        # "restarting" or an unrecognised status: keep polling.
        sleep "$interval"
        elapsed=$((elapsed + interval))
        ;;
    esac
  done

  err "Tentacle container did not reach a running state (last state: ${state})."
  # Best effort: the container may not exist, so a logs failure is ignored.
  docker logs "$name" || true
  return 1
}
|
||||
|
||||
# Stop here unless the container is actually up. The helper has already
# reported the failure and dumped the container logs; continuing would
# make the docker exec probes below fail and be misreported as a GPU or
# NVIDIA Container Toolkit problem. The explicit check does not rely on
# `set -e` being active.
wait_for_container_running "${CONTAINER_NAME}" 60 || exit 1

# Probe order:
#   1. nvidia-smi present in the image -> it must run successfully.
#   2. nvidia-smi absent               -> accept visible NVIDIA device nodes,
#                                         with a warning.
#   3. neither                         -> the GPU is not exposed; abort.
if docker exec "${CONTAINER_NAME}" sh -lc 'command -v nvidia-smi >/dev/null 2>&1'; then
  if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then
    err "GPU not accessible inside container."
    err "nvidia-smi exists in the container but failed to access the GPU."
    err "Check NVIDIA Container Toolkit installation."
    exit 1
  fi
  log "GPU access verified with nvidia-smi."
elif docker exec "${CONTAINER_NAME}" sh -lc 'test -c /dev/nvidiactl || test -c /dev/nvidia0'; then
  warn "nvidia-smi is not present in the container image; NVIDIA device nodes are visible."
  warn "The worker can still run, but GPU capability scoring will be unavailable."
else
  err "GPU device nodes are not visible inside the container."
  err "Check NVIDIA Container Toolkit installation."
  exit 1
fi

# Reached on both the nvidia-smi path and the device-node fallback path.
log "GPU access verified."
|
||||
|
||||
|
||||
# -----------------------------
|
||||
|
||||
Reference in New Issue
Block a user