Enhance tentacle.sh with GPU access validation including container state checks and improved logging

This commit is contained in:
Evan Hosinski
2026-05-21 13:16:48 -04:00
parent 13f4a2f47f
commit d77253c8bf
+45 -5
View File
@@ -301,16 +301,56 @@ docker run -d \
# -----------------------------
# Test GPU Access
# -----------------------------
# Poll a container until Docker reports it as running.
#
# Replaces a fixed start-up delay: the container state is inspected every
# two seconds instead of assuming it is ready after a set time.
#
# Arguments:
#   $1 - container name or ID to inspect
#   $2 - optional timeout in seconds (default: 60); an empty or non-numeric
#        value falls back to 60
# Outputs:
#   On failure, reports the last observed state through err and writes the
#   container's logs to stderr (skipped when the container does not exist).
# Returns:
#   0 as soon as the container is running; 1 if it is stopped, paused,
#   missing, or still not running when the timeout expires.
wait_for_container_running() {
    local name="$1"
    local timeout="${2:-60}"
    local elapsed=0
    local state="unknown"

    # A malformed timeout would make the numeric test below error out and
    # skip the loop entirely, so normalise it first.
    case "$timeout" in
        '' | *[!0-9]*) timeout=60 ;;
    esac

    while [ "$elapsed" -lt "$timeout" ]; do
        # A failing inspect (no such container) is reported as "missing".
        state="$(docker inspect -f '{{if .State.Running}}running{{else if .State.Restarting}}restarting{{else if .State.Paused}}paused{{else}}stopped{{end}}' "$name" 2>/dev/null || echo missing)"
        case "$state" in
            running)
                return 0
                ;;
            stopped | paused | missing)
                # These states are not retried; stop polling and report.
                break
                ;;
            *)
                # "restarting" (or anything unexpected): retry after a pause.
                sleep 2
                elapsed=$((elapsed + 2))
                ;;
        esac
    done

    err "Tentacle container did not reach a running state (last state: ${state})."
    # There are no logs to show for a container that does not exist.
    if [ "$state" != "missing" ]; then
        docker logs "$name" >&2 || true
    fi
    return 1
}
# Wait for the container to be running before probing it. Abort on failure
# so the GPU checks below are never run against a container that is not up
# (which would be misreported as a GPU/toolkit problem).
wait_for_container_running "${CONTAINER_NAME}" 60 || exit 1

# Case 1: the image ships nvidia-smi, so it must be able to reach the GPU.
if docker exec "${CONTAINER_NAME}" sh -lc 'command -v nvidia-smi >/dev/null 2>&1'; then
    if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then
        err "nvidia-smi exists in the container but failed to access the GPU."
        err "Check NVIDIA Container Toolkit installation."
        exit 1
    fi
    log "GPU access verified with nvidia-smi."
# Case 2: no nvidia-smi in the image; accept visible NVIDIA character
# devices (/dev/nvidiactl or /dev/nvidia0) as evidence of GPU passthrough.
elif docker exec "${CONTAINER_NAME}" sh -lc 'test -c /dev/nvidiactl || test -c /dev/nvidia0'; then
    warn "nvidia-smi is not present in the container image; NVIDIA device nodes are visible."
    warn "The worker can still run, but GPU capability scoring will be unavailable."
# Case 3: neither the tool nor the device nodes are present; fail.
else
    err "GPU device nodes are not visible inside the container."
    err "Check NVIDIA Container Toolkit installation."
    exit 1
fi
log "GPU access verified."
# -----------------------------
# Done