diff --git a/tentacle.sh b/tentacle.sh index 404b2b2..5f301f5 100644 --- a/tentacle.sh +++ b/tentacle.sh @@ -301,16 +301,56 @@ docker run -d \ # ----------------------------- # Test GPU Access # ----------------------------- -sleep 5 +wait_for_container_running() { + local name="$1" + local timeout="${2:-60}" + local elapsed=0 + local state="" -if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then - err "GPU not accessible inside container." + while [ "$elapsed" -lt "$timeout" ]; do + state="$(docker inspect -f '{{if .State.Running}}running{{else if .State.Restarting}}restarting{{else if .State.Paused}}paused{{else}}stopped{{end}}' "$name" 2>/dev/null || echo missing)" + + case "$state" in + running) + return 0 + ;; + restarting) + sleep 2 + elapsed=$((elapsed + 2)) + ;; + stopped|paused|missing) + break + ;; + *) + sleep 2 + elapsed=$((elapsed + 2)) + ;; + esac + done + + err "Tentacle container did not reach a running state (last state: ${state})." + docker logs "$name" || true + return 1 +} + +wait_for_container_running "${CONTAINER_NAME}" 60 + +if docker exec "${CONTAINER_NAME}" sh -lc 'command -v nvidia-smi >/dev/null 2>&1'; then + if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then + err "nvidia-smi exists in the container but failed to access the GPU." + err "Check NVIDIA Container Toolkit installation." + exit 1 + fi + log "GPU access verified with nvidia-smi." +elif docker exec "${CONTAINER_NAME}" sh -lc 'test -c /dev/nvidiactl || test -c /dev/nvidia0'; then + warn "nvidia-smi is not present in the container image; NVIDIA device nodes are visible." + warn "The worker can still run, but GPU capability scoring will be unavailable." +else + err "GPU device nodes are not visible inside the container." err "Check NVIDIA Container Toolkit installation." exit 1 fi -log "GPU access verified." - # ----------------------------- # Done @@ -322,4 +362,4 @@ echo " Container: ${CONTAINER_NAME}" echo " Image: ${IMAGE}" echo " Logs: docker logs -f ${CONTAINER_NAME}" echo " Restart: systemctl restart docker" -echo \ No newline at end of file +echo