Update tentacle.sh
This commit is contained in:
+325
@@ -0,0 +1,325 @@
|
||||
#!/usr/bin/env bash
#
# Installer for the Tentacle GPU worker container.
# Optional environment overrides read by this script:
#   TENTACLE_TAG         image tag to deploy (default: latest)
#   KRKN_REGISTRY_TOKEN  registry access token (prompted for when unset)
#   KRKN_ORCH_ADDR       orchestrator address (prompted for when unset)
set -euo pipefail

# Image tag to deploy; falls back to "latest" when TENTACLE_TAG is unset/empty.
readonly IMAGE_TAG="${TENTACLE_TAG:-latest}"

# -----------------------------
# Config
# -----------------------------
readonly REGISTRY="hub.krkn.tech"
# Derived from REGISTRY so the registry host is spelled in exactly one place.
readonly IMAGE_BASE="${REGISTRY}/krkncli/cuda"
readonly CONTAINER_NAME="tentacle"

# CUDA versions that have a matching image path under IMAGE_BASE.
readonly -a SUPPORTED_CUDA=("12.4.1" "12.5.1" "12.6.3" "13.0.1" "13.1.0")
|
||||
|
||||
# -----------------------------
|
||||
# Helpers
|
||||
# -----------------------------
|
||||
# Print an informational message to stdout with a bold-green "[*]" prefix.
# All arguments are joined into one message and passed to printf as data, so
# backslash sequences inside the message are printed verbatim, not expanded.
log() { printf '\033[1;32m[*]\033[0m %s\n' "$*"; }
|
||||
# Print a warning to stdout with a bold-yellow "[!]" prefix.
# All arguments are joined into one message and passed to printf as data, so
# backslash sequences inside the message are printed verbatim, not expanded.
warn() { printf '\033[1;33m[!]\033[0m %s\n' "$*"; }
|
||||
# Print an error message to stderr with a red "[✗]" prefix.
# All arguments are joined into one message and passed to printf as data, so
# backslash sequences inside the message are printed verbatim, not expanded.
err() { printf '\033[0;31m[✗]\033[0m %s\n' "$*" >&2; }
|
||||
|
||||
# -----------------------------
|
||||
# Root check
|
||||
# -----------------------------
|
||||
# Abort unless the effective user id is 0.
if (( $(id -u) != 0 )); then
  err "Run as root."
  exit 1
fi
|
||||
|
||||
# -----------------------------
|
||||
# OS check
|
||||
# -----------------------------
|
||||
# The package steps further down call apt-get, so probe for that exact tool.
if ! command -v apt-get >/dev/null 2>&1; then
  err "This installer currently supports Debian/Ubuntu only."
  exit 1
fi
|
||||
|
||||
# -----------------------------
|
||||
# Ensure Docker
|
||||
# -----------------------------
|
||||
# Install Docker through the get.docker.com convenience script only when no
# docker binary is on PATH, then enable and start the service.
if command -v docker >/dev/null; then
  log "Docker already installed."
else
  log "Installing Docker..."
  curl -fsSL https://get.docker.com | sh
  systemctl enable --now docker
fi
|
||||
|
||||
# -----------------------------
|
||||
# Ensure NVIDIA driver
|
||||
# -----------------------------
|
||||
# The driver is a prerequisite this script does not install itself.
if ! command -v nvidia-smi >/dev/null; then
  err "NVIDIA driver not detected. Install driver first and reboot."
  exit 1
fi

# Show the nvidia-smi report to the operator; a failing nvidia-smi is
# deliberately tolerated at this point.
nvidia-smi || true
|
||||
|
||||
# -----------------------------
|
||||
# Ensure NVIDIA container runtime
|
||||
# -----------------------------
|
||||
# Install the NVIDIA container toolkit from NVIDIA's apt repository and
# register it as a Docker runtime, unless nvidia-ctk is already on PATH.
if command -v nvidia-ctk >/dev/null; then
  log "NVIDIA container toolkit already installed."
else
  log "Installing NVIDIA container toolkit..."

  nvidia_repo_url="https://nvidia.github.io/libnvidia-container"
  nvidia_keyring="/usr/share/keyrings/nvidia-toolkit.gpg"
  nvidia_apt_list="/etc/apt/sources.list.d/nvidia-toolkit.list"

  # Start from a clean slate: drop the keyring and list this script writes,
  # plus a list named nvidia-container-toolkit.list.
  # NOTE(review): presumably that second name is what a manual install leaves
  # behind and would duplicate the repository entry - confirm.
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list
  rm -f "${nvidia_apt_list}"
  rm -f "${nvidia_keyring}"

  # Fetch the repository signing key and store it in binary keyring form.
  curl -fsSL "${nvidia_repo_url}/gpgkey" \
    | gpg --dearmor -o "${nvidia_keyring}"

  # Fetch the repository list and pin every "deb" entry to that keyring.
  curl -fsSL "${nvidia_repo_url}/stable/deb/nvidia-container-toolkit.list" \
    | sed "s#^deb #deb [signed-by=${nvidia_keyring}] #" \
      > "${nvidia_apt_list}"

  apt-get update
  apt-get install -y nvidia-container-toolkit

  # Register the runtime with Docker and restart the daemon afterwards.
  nvidia-ctk runtime configure --runtime=docker
  systemctl restart docker
fi
|
||||
|
||||
# -----------------------------
|
||||
# Detect CUDA version
|
||||
# -----------------------------
|
||||
# Read the "CUDA Version: X.Y" value from the nvidia-smi report.
# The output is captured first and parsed separately so that a failing
# nvidia-smi or a missing version field produces an explicit error instead of
# a silent exit under `set -euo pipefail`.
smi_output="$(nvidia-smi 2>&1)" || true
cuda_version_re='CUDA Version: *([0-9.]+)'

DETECTED_CUDA=""
if [[ "$smi_output" =~ $cuda_version_re ]]; then
  DETECTED_CUDA="${BASH_REMATCH[1]}"
fi

if [[ -z "$DETECTED_CUDA" ]]; then
  err "Could not read a CUDA version from nvidia-smi output."
  err "Check that the NVIDIA driver is loaded and that nvidia-smi runs cleanly."
  exit 1
fi

log "Detected CUDA capability: ${DETECTED_CUDA}"

# Extract major.minor (e.g., 12.4)
DETECTED_MM="$(cut -d. -f1,2 <<<"$DETECTED_CUDA")"
|
||||
|
||||
#######################################
# Pick the image CUDA version matching the detected driver CUDA version.
# Only entries whose major.minor equals DETECTED_MM are considered; among
# those the highest version (by `sort -V`) wins.
# Globals:   SUPPORTED_CUDA (read), DETECTED_MM (read)
# Arguments: none
# Outputs:   the selected version on stdout
# Returns:   0 when a version was selected, 1 when nothing matches
#######################################
choose_cuda() {
  # Loop variables are local so they do not leak into the caller's scope.
  local v v_mm
  local -a candidates=()

  for v in "${SUPPORTED_CUDA[@]}"; do
    v_mm="$(cut -d. -f1,2 <<<"$v")"
    if [[ "$v_mm" == "$DETECTED_MM" ]]; then
      candidates+=("$v")
    fi
  done

  # Nothing with the same major.minor: report failure without output.
  (( ${#candidates[@]} > 0 )) || return 1

  # Version-sort the matches once and keep the highest patch level.
  printf '%s\n' "${candidates[@]}" | sort -V | tail -n1
}
|
||||
|
||||
# Resolve the image CUDA version, or explain why no image fits and stop.
if ! CUDA_VERSION="$(choose_cuda)"; then
  err "No Tentacle image for CUDA ${DETECTED_MM}.x"
  err "Driver reports CUDA ${DETECTED_CUDA}"
  err "Available images: ${SUPPORTED_CUDA[*]}"
  err "Install compatible NVIDIA driver or contact support."
  exit 1
fi
log "Selected CUDA image version: ${CUDA_VERSION}"

# Full image reference: <base>/<cuda version>/tentacle:<tag>
IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"
|
||||
|
||||
# -----------------------------
|
||||
# Registry login
|
||||
# -----------------------------
|
||||
# Drop any previously stored credentials for the registry; failure is
# tolerated on purpose (e.g. nothing was stored).
docker logout "$REGISTRY" >/dev/null 2>&1 || true

REG_USER="krkn-registry"
REG_PASS="${KRKN_REGISTRY_TOKEN:-}"

# No token in the environment: ask on the terminal. Input is read from
# /dev/tty (not stdin) with echo disabled. A failed read is reported
# explicitly because `set -e` would otherwise end the script without a message.
if [[ -z "$REG_PASS" ]]; then
  if ! read -rsp "Registry access token: " REG_PASS </dev/tty; then
    err "Could not read registry token. Set KRKN_REGISTRY_TOKEN or run from a terminal."
    exit 1
  fi
  echo
fi

# The token goes to docker over stdin, never on the command line; printf is
# used so a token that looks like an echo option is passed through unchanged.
if ! printf '%s\n' "$REG_PASS" \
  | docker login "$REGISTRY" -u "$REG_USER" --password-stdin; then
  err "Docker login failed. Invalid token or registry unreachable."
  exit 1
fi
|
||||
|
||||
# -----------------------------
|
||||
# Prompt config
|
||||
# -----------------------------
|
||||
# Orchestrator address: taken from KRKN_ORCH_ADDR when set, otherwise asked
# for on the terminal (/dev/tty, not stdin).
ORCH_ADDR="${KRKN_ORCH_ADDR:-}"

if [[ -z "$ORCH_ADDR" ]]; then
  echo
  # A failed read (e.g. no terminal) is treated like empty input so the
  # error below is shown instead of a silent `set -e` exit.
  read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR </dev/tty \
    || ORCH_ADDR=""
  if [[ -z "$ORCH_ADDR" ]]; then
    err "Orchestrator address is required."
    exit 1
  fi
fi
|
||||
|
||||
# -----------------------------
|
||||
# Normalize orchestrator address
|
||||
# -----------------------------
|
||||
|
||||
#######################################
# Normalize an orchestrator address to [scheme]host:port[path].
#   - an optional http:// or https:// prefix is preserved
#   - a trailing "-krkn" on the FIRST hostname label is removed
#       krkn-krkn.example.com -> krkn.example.com
#       foo-krkn              -> foo
#     (the "-krkn" name belongs to the krkn service, not to the conductor,
#     which is served on the primary host)
#   - when no numeric port is given, the conductor default 65535 is appended
# Arguments: $1 - address as entered by the user
# Outputs:   normalized address on stdout
#######################################
normalize_orch_addr() {
  local input="$1"
  local scheme="" rest="" hostport="" path="" host="" port=""

  # Patterns are kept in variables so they are used unquoted with =~.
  local -r scheme_re='^(https?://)(.*)$'
  local -r hostpath_re='^([^/]+)(/.*)?$'
  local -r port_re='^(.+):([0-9]+)$'
  local -r krkn_label_re='^([^.]*)-krkn(\..*)?$'

  # Extract scheme if present
  rest="$input"
  if [[ "$input" =~ $scheme_re ]]; then
    scheme="${BASH_REMATCH[1]}"
    rest="${BASH_REMATCH[2]}"
  fi

  # Split host[:port] and path
  hostport="$rest"
  if [[ "$rest" =~ $hostpath_re ]]; then
    hostport="${BASH_REMATCH[1]}"
    path="${BASH_REMATCH[2]:-}"
  fi

  # Split host and port
  host="$hostport"
  if [[ "$hostport" =~ $port_re ]]; then
    host="${BASH_REMATCH[1]}"
    port="${BASH_REMATCH[2]}"
  fi

  # Strip the -krkn suffix from the first label only; a later label that
  # happens to end in -krkn (e.g. app.my-krkn.io) is part of the domain and
  # must stay untouched.
  if [[ "$host" =~ $krkn_label_re ]]; then
    host="${BASH_REMATCH[1]}${BASH_REMATCH[2]:-}"
  fi

  # If no port was given, fall back to the default.
  printf '%s\n' "${scheme}${host}:${port:-65535}${path}"
}
|
||||
|
||||
# Replace the raw address with its normalized form and show the result.
ORCH_ADDR="$(normalize_orch_addr "$ORCH_ADDR")"
log "Normalized orchestrator address: ${ORCH_ADDR}"

# Worker ID: KRKN_WORKER_ID when set (same pattern as the other KRKN_*
# overrides), otherwise asked for on the terminal with the host name as the
# default. A failed read (no terminal) falls back to the default instead of
# ending the script silently under `set -e`.
DEFAULT_WORKER_ID="$(hostname)"
WORKER_ID="${KRKN_WORKER_ID:-}"
if [[ -z "$WORKER_ID" ]]; then
  read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID </dev/tty || WORKER_ID=""
  WORKER_ID="${WORKER_ID:-$DEFAULT_WORKER_ID}"
fi
|
||||
|
||||
# -----------------------------
|
||||
# Pull image
|
||||
# -----------------------------
|
||||
# Fetch the image up front so registry/permission problems surface with a
# clear message before the old container is touched.
log "Pulling image: ${IMAGE}"
if ! docker pull --quiet "${IMAGE}"; then
  err "Failed to pull image. Check token permissions or image availability."
  exit 1
fi
|
||||
|
||||
# -----------------------------
|
||||
# Stop old container
|
||||
# -----------------------------
|
||||
#######################################
# Remove an existing container by name, escalating if Docker cannot do it.
# Order: docker stop -> docker rm -f -> SIGKILL of the container's main PID
# followed by a final docker rm.
# Existence checks use `docker container inspect`, so only a container with
# this name counts; other Docker objects sharing the name are ignored.
# Arguments: $1 - container name
# Returns:   0 when no container with that name remains.
#            Exits the script with status 1 when removal ultimately fails.
#######################################
cleanup_container() {
  local name="$1"
  local pid

  # If container doesn't exist, nothing to do
  if ! docker container inspect "$name" >/dev/null 2>&1; then
    return 0
  fi

  log "Cleaning up existing container: $name"

  # Graceful stop first, then forced removal; errors are ignored here because
  # the outcome is verified right below.
  docker stop "$name" >/dev/null 2>&1 || true
  docker rm -f "$name" >/dev/null 2>&1 || true

  # Still present: kill the container's main process and retry the removal.
  if docker container inspect "$name" >/dev/null 2>&1; then
    warn "Docker could not remove $name, attempting manual kill..."

    pid="$(docker container inspect -f '{{.State.Pid}}' "$name" 2>/dev/null || true)"

    # Only kill when a real PID was obtained (empty or 0 means none).
    if [[ -n "$pid" && "$pid" != "0" ]]; then
      warn "Killing stuck container process (PID=$pid)"
      kill -9 "$pid" >/dev/null 2>&1 || true
      sleep 1
    fi

    # Final remove attempt
    docker rm "$name" >/dev/null 2>&1 || true
  fi

  # Verify
  if docker container inspect "$name" >/dev/null 2>&1; then
    err "Failed to remove container $name (manual intervention required)"
    exit 1
  fi
}
|
||||
|
||||
# Remove any container left over from an earlier install under the same name.
cleanup_container "$CONTAINER_NAME"
|
||||
|
||||
# -----------------------------
|
||||
# Create dirs
|
||||
# -----------------------------
|
||||
# Host directories that are bind-mounted into the container (work dir, logs).
mkdir -p /tmp/tentacle /tmp/tentacle-logs
|
||||
|
||||
# -----------------------------
|
||||
# Run container
|
||||
# -----------------------------
|
||||
log "Starting Tentacle worker..."

# Arguments are collected in an array so each option sits on its own line
# without relying on backslash continuations.
run_args=(
  -d
  --pull=always
  --name "${CONTAINER_NAME}"
  --network host
  --runtime=nvidia
  --gpus all
  --restart unless-stopped
  # Settings handed to the worker through its environment.
  -e "CONDUCTOR_ADDR=${ORCH_ADDR}"
  -e "WORKER_ID=${WORKER_ID}"
  # Host directories shared with the container.
  -v /tmp/tentacle:/tmp/tentacle
  -v /tmp/tentacle-logs:/opt/tentacle/logs
)

docker run "${run_args[@]}" "${IMAGE}"
|
||||
|
||||
# -----------------------------
|
||||
# Test GPU Access
|
||||
# -----------------------------
|
||||
# Give the freshly started container a few seconds before probing it.
sleep 5

# Run nvidia-smi inside the container; any failure is reported as the GPU
# not being accessible there.
if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then
  err "GPU not accessible inside container."
  err "Check NVIDIA Container Toolkit installation."
  exit 1
fi

log "GPU access verified."
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Done
|
||||
# -----------------------------
|
||||
log "Tentacle worker installed and running."

# Short cheat sheet for the operator. The restart hint targets the worker
# container itself rather than the whole Docker daemon.
echo
printf ' Container: %s\n' "${CONTAINER_NAME}"
printf ' Image: %s\n' "${IMAGE}"
printf ' Logs: docker logs -f %s\n' "${CONTAINER_NAME}"
printf ' Restart: docker restart %s\n' "${CONTAINER_NAME}"
echo
|
||||
Reference in New Issue
Block a user