Enhance tentacle.sh with GPU access validation including container state checks and improved logging
This commit is contained in:
+46
-6
@@ -301,16 +301,56 @@ docker run -d \
|
|||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Test GPU Access
|
# Test GPU Access
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
sleep 5
|
#######################################
# Wait until a Docker container reports a running state.
#
# Polls `docker inspect` and maps the container's State flags to one of
# running / restarting / paused / stopped, or "missing" when the inspect
# command itself fails. Returns as soon as the container is running, gives
# up immediately when it is stopped, paused, or missing, and keeps polling
# (2 seconds between attempts) in every other case until the timeout is
# used up. Only the time spent sleeping is counted against the timeout.
#
# On failure the last observed state is reported through err and the
# container logs are dumped (best effort).
#
# Arguments:
#   $1 - container name or ID
#   $2 - timeout in seconds; optional, defaults to 60; must be a
#        non-negative integer
# Returns:
#   0 if the container is running, 1 otherwise (including invalid timeout)
#######################################
wait_for_container_running() {
  local name="$1"
  local timeout="${2:-60}"
  local poll_interval=2
  local elapsed=0
  local state=""

  # A non-numeric timeout would make the `-lt` test below fail with
  # "integer expression expected", skip the loop entirely, and surface as a
  # confusing "last state: " message. Reject it up front instead.
  case "$timeout" in
    *[!0-9]*)
      err "Invalid timeout '${timeout}' for wait_for_container_running (expected a non-negative integer)."
      return 1
      ;;
  esac

  while [ "$elapsed" -lt "$timeout" ]; do
    # stderr is discarded and a failed inspect is reported as "missing" so
    # the case statement below always sees a single well-known token.
    state="$(docker inspect -f '{{if .State.Running}}running{{else if .State.Restarting}}restarting{{else if .State.Paused}}paused{{else}}stopped{{end}}' "$name" 2>/dev/null || echo missing)"

    case "$state" in
      running)
        return 0
        ;;
      stopped | paused | missing)
        # Terminal for our purposes: waiting longer will not help.
        break
        ;;
      *)
        # "restarting" or any unexpected output: try again after a pause.
        sleep "$poll_interval"
        elapsed=$((elapsed + poll_interval))
        ;;
    esac
  done

  err "Tentacle container did not reach a running state (last state: ${state})."
  # Best effort only: the container may not exist, so ignore failures here.
  docker logs "$name" || true
  return 1
}
|
||||||
|
|
||||||
|
# Abort explicitly if the container never reached a running state. Without
# this the script only stops here when errexit happens to be active;
# otherwise the docker exec probes below would run against a dead container
# and end in a misleading "GPU device nodes are not visible" error.
wait_for_container_running "${CONTAINER_NAME}" 60 || exit 1

# GPU validation, from strongest to weakest evidence:
#   1. nvidia-smi is available in the container: it must run successfully.
#   2. nvidia-smi is absent but an NVIDIA character device is present:
#      warn and continue.
#   3. Neither: fail.
if docker exec "${CONTAINER_NAME}" sh -lc 'command -v nvidia-smi >/dev/null 2>&1'; then
  # The tool is present, so a failing run is treated as a GPU access problem
  # rather than a missing binary.
  if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then
    err "nvidia-smi exists in the container but failed to access the GPU."
    err "Check NVIDIA Container Toolkit installation."
    exit 1
  fi
  log "GPU access verified with nvidia-smi."
elif docker exec "${CONTAINER_NAME}" sh -lc 'test -c /dev/nvidiactl || test -c /dev/nvidia0'; then
  # No nvidia-smi in the image; fall back to checking for the device nodes.
  warn "nvidia-smi is not present in the container image; NVIDIA device nodes are visible."
  warn "The worker can still run, but GPU capability scoring will be unavailable."
else
  err "GPU device nodes are not visible inside the container."
  err "Check NVIDIA Container Toolkit installation."
  exit 1
fi
|
||||||
|
|
||||||
log "GPU access verified."
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Done
|
# Done
|
||||||
@@ -322,4 +362,4 @@ echo " Container: ${CONTAINER_NAME}"
|
|||||||
echo " Image: ${IMAGE}"
|
echo " Image: ${IMAGE}"
|
||||||
echo " Logs: docker logs -f ${CONTAINER_NAME}"
|
echo " Logs: docker logs -f ${CONTAINER_NAME}"
|
||||||
echo " Restart: systemctl restart docker"
|
echo " Restart: systemctl restart docker"
|
||||||
echo
|
echo
|
||||||
|
|||||||
Reference in New Issue
Block a user