Files
Scripts/tentacle.sh
T

366 lines
9.2 KiB
Bash

#!/usr/bin/env bash
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# -----------------------------
# Config
# -----------------------------
# Image tag to deploy; the TENTACLE_TAG environment variable overrides the
# "latest" default.
IMAGE_TAG=${TENTACLE_TAG:-latest}
# Registry host used for docker logout/login.
REGISTRY='hub.krkn.tech'
# Repository prefix; the CUDA version, image name and tag are appended later.
IMAGE_BASE='hub.krkn.tech/krakentech/cuda'
# Name given to the worker container.
CONTAINER_NAME='tentacle'
# CUDA versions the installer is willing to select an image for.
SUPPORTED_CUDA=(
  '12.4.1'
  '12.5.1'
  '12.6.3'
  '13.0.1'
  '13.1.0'
)
# -----------------------------
# Helpers
# -----------------------------
# Status helpers. Each prints a coloured marker followed by all arguments
# joined with spaces. Escape sequences are confined to the printf format
# string, so backslashes that happen to appear in the message (paths,
# addresses, user input) are printed literally instead of being expanded.

# log MESSAGE... - informational line (green "[*]") on stdout.
log() { printf '\033[1;32m[*]\033[0m %s\n' "$*"; }

# warn MESSAGE... - warning line (yellow "[!]") on stdout.
warn() { printf '\033[1;33m[!]\033[0m %s\n' "$*"; }

# err MESSAGE... - error line (red "[✗]") on stderr.
err() { printf '\033[0;31m[✗]\033[0m %s\n' "$*" >&2; }
# -----------------------------
# Root check
# -----------------------------
# Refuse to continue unless the effective user id is 0.
[[ "$(id -u)" -eq 0 ]] || {
  err "Run as root."
  exit 1
}
# -----------------------------
# OS check
# -----------------------------
# The presence of an `apt` command is used as the Debian/Ubuntu indicator.
command -v apt >/dev/null || {
  err "This installer currently supports Debian/Ubuntu only."
  exit 1
}
# -----------------------------
# Ensure Docker
# -----------------------------
# Install Docker through the get.docker.com script only when no `docker`
# command is found; the service is then enabled and started immediately.
if command -v docker >/dev/null; then
  log "Docker already installed."
else
  log "Installing Docker..."
  curl -fsSL https://get.docker.com | sh
  systemctl enable --now docker
fi
# -----------------------------
# Ensure NVIDIA driver
# -----------------------------
# The driver is a prerequisite that this script does not install; its
# presence is inferred from the nvidia-smi command being available.
command -v nvidia-smi >/dev/null || {
  err "NVIDIA driver not detected. Install driver first and reboot."
  exit 1
}
# Print the nvidia-smi report for the operator; a failure here is tolerated.
nvidia-smi || true
# -----------------------------
# Ensure NVIDIA container runtime
# -----------------------------
# Installs the NVIDIA container toolkit from the libnvidia-container apt
# repository and registers the NVIDIA runtime with Docker.
install_nvidia_toolkit() {
  local -r keyring="/usr/share/keyrings/nvidia-toolkit.gpg"
  local -r repo_url="https://nvidia.github.io/libnvidia-container"

  log "Installing NVIDIA container toolkit..."

  # Clear an existing toolkit source list and keyring before writing new ones.
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list
  rm -f "${keyring}"

  # Fetch the repository key and store it in binary (dearmored) form.
  curl -fsSL "${repo_url}/gpgkey" \
    | gpg --dearmor -o "${keyring}"

  # Fetch the source list and pin every "deb" entry to the keyring above.
  curl -fsSL "${repo_url}/stable/deb/nvidia-container-toolkit.list" \
    | sed "s#^deb #deb [signed-by=${keyring}] #" \
      > /etc/apt/sources.list.d/nvidia-toolkit.list

  apt-get update
  apt-get install -y nvidia-container-toolkit

  # Register the runtime with Docker, then restart the daemon.
  nvidia-ctk runtime configure --runtime=docker
  systemctl restart docker
}

# The toolkit counts as installed when the nvidia-ctk command is available.
if command -v nvidia-ctk >/dev/null; then
  log "NVIDIA container toolkit already installed."
else
  install_nvidia_toolkit
fi
# -----------------------------
# Detect CUDA version
# -----------------------------
# Parse the "CUDA Version: X.Y" field out of the nvidia-smi output.
#
# With `set -euo pipefail` in effect, a pipeline whose filter finds nothing
# would terminate the script without any message, so the failure is caught
# here and reported explicitly. At least one digit is required, so a
# non-numeric value in that field is treated as "not detected" rather than
# producing an empty version string.
DETECTED_CUDA="$(nvidia-smi | sed -n 's/.*CUDA Version: *\([0-9][0-9.]*\).*/\1/p')" || DETECTED_CUDA=""
# Keep only the first match in case the field appears on more than one line.
DETECTED_CUDA="${DETECTED_CUDA%%$'\n'*}"
if [[ -z "$DETECTED_CUDA" ]]; then
  err "Could not determine the CUDA version from nvidia-smi output."
  err "Check that the NVIDIA driver is loaded and that nvidia-smi reports a CUDA Version."
  exit 1
fi
log "Detected CUDA capability: ${DETECTED_CUDA}"
# Extract major.minor (e.g., 12.4)
DETECTED_MM="$(cut -d. -f1,2 <<<"$DETECTED_CUDA")"
#######################################
# Pick the image CUDA version that matches the detected driver capability.
# Only entries whose major.minor equals DETECTED_MM are considered; among
# those, the highest version (by `sort -V` ordering) wins.
# Globals:   SUPPORTED_CUDA (read), DETECTED_MM (read)
# Arguments: none
# Outputs:   the selected version on stdout
# Returns:   0 when a version was selected, 1 when nothing matches
#######################################
choose_cuda() {
  local best=""
  # Loop variables are local so a call in the current shell does not leak
  # them into the caller's scope.
  local candidate=""
  local candidate_mm=""

  for candidate in "${SUPPORTED_CUDA[@]}"; do
    candidate_mm="$(cut -d. -f1,2 <<<"$candidate")"
    # Only consider versions with the same major.minor.
    [[ "$candidate_mm" == "$DETECTED_MM" ]] || continue
    # Keep the candidate when it sorts last against the current best.
    if [[ -z "$best" ]] \
      || [[ "$(printf '%s\n%s\n' "$best" "$candidate" | sort -V | tail -n1)" == "$candidate" ]]; then
      best="$candidate"
    fi
  done

  [[ -n "$best" ]] || return 1
  printf '%s\n' "$best"
}
# Resolve the image version; without a matching image, explain the mismatch
# and stop.
if ! CUDA_VERSION="$(choose_cuda)"; then
  err "No Tentacle image for CUDA ${DETECTED_MM}.x"
  err "Driver reports CUDA ${DETECTED_CUDA}"
  err "Available images: ${SUPPORTED_CUDA[*]}"
  err "Install compatible NVIDIA driver or contact support."
  exit 1
fi
log "Selected CUDA image version: ${CUDA_VERSION}"
# Full image reference: <base>/<cuda version>/tentacle:<tag>
IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"
# -----------------------------
# Registry login
# -----------------------------
# Drop any existing login for the registry first; failure is irrelevant.
docker logout "$REGISTRY" >/dev/null 2>&1 || true
REG_USER="krkn-registry"
# Token source: KRKN_REGISTRY_TOKEN if set, otherwise a silent prompt on the
# controlling terminal.
REG_PASS="${KRKN_REGISTRY_TOKEN:-}"
if [[ -z "$REG_PASS" ]]; then
  read -rsp "Registry access token: " REG_PASS < /dev/tty
  # read -s does not echo the newline typed by the user; emit one.
  echo
fi
# printf is used instead of echo so that a token resembling an echo option
# (for example "-n" or "-e") is passed through unchanged. The token travels
# over stdin and never appears in a process argument list.
if ! printf '%s\n' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin; then
  err "Docker login failed. Invalid token or registry unreachable."
  exit 1
fi
# -----------------------------
# Prompt config
# -----------------------------
# Orchestrator address: taken from KRKN_ORCH_ADDR when that is non-empty,
# otherwise asked for on the controlling terminal. An empty answer is fatal.
if [[ -n "${KRKN_ORCH_ADDR:-}" ]]; then
  ORCH_ADDR="${KRKN_ORCH_ADDR}"
else
  ORCH_ADDR=""
  echo
  read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR < /dev/tty
  [[ -n "$ORCH_ADDR" ]] || {
    err "Orchestrator address is required."
    exit 1
  }
fi
# -----------------------------
# Normalize orchestrator address
# -----------------------------
#######################################
# Rewrite an orchestrator address into the form
#   [scheme]host:port[path]
# Arguments: $1 - address as entered, e.g. https://foo-krkn.example.com/api
# Outputs:   the normalized address on stdout
# Notes:
#   - Only a lower-case http:// or https:// prefix is treated as a scheme.
#   - A trailing ":<digits>" on the host part is taken as the port; when it
#     is absent, port 65535 is used.
#   - The first "-krkn" that ends a hostname label (followed by "." or the
#     end of the host) is removed: foo-krkn -> foo,
#     krkn-krkn.example.com -> krkn.example.com.
#######################################
normalize_orch_addr() {
  local addr="$1"
  local proto=""
  local remainder=""
  local authority=""
  local tail=""
  local name=""
  local portnum=""

  # Peel off the scheme, if any.
  case "$addr" in
    https://*) proto="https://" ;;
    http://*) proto="http://" ;;
  esac
  remainder="${addr#"$proto"}"

  # Everything up to the first "/" is host[:port]; the rest is the path.
  # Input that is empty or starts with "/" is kept whole as the host part.
  if [[ -n "$remainder" && "$remainder" != /* ]]; then
    authority="${remainder%%/*}"
    tail="${remainder:${#authority}}"
  else
    authority="$remainder"
  fi

  # A purely numeric suffix after the last ":" is the port.
  name="$authority"
  if [[ "$authority" =~ ^(.+):([0-9]+)$ ]]; then
    name="${BASH_REMATCH[1]}"
    portnum="${BASH_REMATCH[2]}"
  fi

  # The conductor is served on the primary host at port 65535; the "-krkn"
  # suffix names the krkn service rather than the conductor, so drop it.
  name="$(echo "$name" | sed -E 's/-krkn(\.|$)/\1/')"

  echo "${proto}${name}:${portnum:-65535}${tail}"
}
ORCH_ADDR="$(normalize_orch_addr "$ORCH_ADDR")"
log "Normalized orchestrator address: ${ORCH_ADDR}"
# Worker ID resolution, in order of precedence:
#   1. KRKN_WORKER_ID from the environment (same pattern as the token and
#      orchestrator address, so a fully unattended install is possible),
#   2. an interactive answer, when a controlling terminal can be opened,
#   3. the machine's hostname.
DEFAULT_WORKER_ID="$(hostname)"
WORKER_ID="${KRKN_WORKER_ID:-}"
# Probe /dev/tty before prompting: without a terminal the redirection fails
# and, under `set -e`, would abort the install even though a default exists.
if [[ -z "$WORKER_ID" ]] && { : < /dev/tty; } 2>/dev/null; then
  # A failed or interrupted read (EOF) falls back to the default as well.
  read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID < /dev/tty || WORKER_ID=""
fi
WORKER_ID="${WORKER_ID:-$DEFAULT_WORKER_ID}"
# -----------------------------
# Pull image
# -----------------------------
# Fetch the image up front so that a registry or permission problem is
# reported with a dedicated message.
log "Pulling image: ${IMAGE}"
if ! docker pull --quiet "${IMAGE}"; then
  err "Failed to pull image. Check token permissions or image availability."
  exit 1
fi
# -----------------------------
# Stop old container
# -----------------------------
#######################################
# Stop and remove a container by name, escalating if Docker cannot remove it.
# Uses `docker container inspect` so that only containers are matched; a
# volume, network or image that shares the name does not count as an
# existing container.
# Arguments: $1 - container name
# Outputs:   progress via log/warn, failure via err
# Returns:   0 when the container is absent or was removed; exits the
#            script with status 1 when it cannot be removed.
#######################################
cleanup_container() {
  local name="$1"
  local pid=""

  # If the container doesn't exist, there is nothing to do (and nothing to log).
  if ! docker container inspect "$name" >/dev/null 2>&1; then
    return 0
  fi
  log "Cleaning up existing container: $name"

  # Try a graceful stop first, then a forced remove. Both are best-effort;
  # the checks below decide whether they worked.
  docker stop "$name" >/dev/null 2>&1 || true
  docker rm -f "$name" >/dev/null 2>&1 || true

  # Still there: fall back to killing the container's main process.
  if docker container inspect "$name" >/dev/null 2>&1; then
    warn "Docker could not remove $name, attempting manual kill..."
    pid="$(docker container inspect -f '{{.State.Pid}}' "$name" 2>/dev/null || echo "")"
    # Only signal a real, positive PID; 0 means the container has no process.
    # SIGKILL is used because stop and forced removal have already failed.
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
      warn "Killing stuck container process (PID=$pid)"
      kill -9 "$pid" >/dev/null 2>&1 || true
      sleep 1
    fi
    # Final remove attempt.
    docker rm "$name" >/dev/null 2>&1 || true
  fi

  # Verify.
  if docker container inspect "$name" >/dev/null 2>&1; then
    err "Failed to remove container $name (manual intervention required)"
    exit 1
  fi
}
# Remove a previous worker container (if any) before starting a new one.
cleanup_container "${CONTAINER_NAME}"
# -----------------------------
# Create dirs
# -----------------------------
# Host directories that are bind-mounted into the container by docker run.
for host_dir in /tmp/tentacle /tmp/tentacle-logs; do
  mkdir -p "${host_dir}"
done
# -----------------------------
# Run container
# -----------------------------
log "Starting Tentacle worker..."
# Options are collected in an array so each one can be annotated.
tentacle_run_args=(
  -d
  # Always check the registry for a newer image under the same tag.
  --pull=always
  --name "${CONTAINER_NAME}"
  --network host
  # GPU access through the NVIDIA runtime, exposing every GPU.
  --runtime=nvidia
  --gpus all
  --restart unless-stopped
  # Worker configuration passed through the environment.
  -e CONDUCTOR_ADDR="${ORCH_ADDR}"
  -e WORKER_ID="${WORKER_ID}"
  # Host directories mounted into the container.
  -v /tmp/tentacle:/tmp/tentacle
  -v /tmp/tentacle-logs:/opt/tentacle/logs
)
docker run "${tentacle_run_args[@]}" "${IMAGE}"
# -----------------------------
# Test GPU Access
# -----------------------------
#######################################
# Poll a container until it reports a running state.
# Arguments: $1 - container name
#            $2 - time budget in seconds (default 60); polls are 2 s apart
# Outputs:   on failure, an error line plus the container's logs
# Returns:   0 as soon as the container is running; 1 when it is stopped,
#            paused or cannot be inspected, or when the budget runs out
#            while it is still restarting.
#######################################
wait_for_container_running() {
  local name="$1"
  local limit="${2:-60}"
  local waited=0
  local status=""
  # Collapses the container's State flags into a single word.
  local -r probe='{{if .State.Running}}running{{else if .State.Restarting}}restarting{{else if .State.Paused}}paused{{else}}stopped{{end}}'

  while [ "$waited" -lt "$limit" ]; do
    status="$(docker inspect -f "$probe" "$name" 2>/dev/null || echo missing)"

    if [[ "$status" == "running" ]]; then
      return 0
    fi
    # Terminal states: waiting longer will not help.
    if [[ "$status" == "stopped" || "$status" == "paused" || "$status" == "missing" ]]; then
      break
    fi

    # "restarting" (or anything unrecognised): give it another moment.
    sleep 2
    waited=$((waited + 2))
  done

  err "Tentacle container did not reach a running state (last state: ${status})."
  docker logs "$name" || true
  return 1
}
# Under `set -e`, a failed wait ends the script here.
wait_for_container_running "${CONTAINER_NAME}" 60
# GPU verification inside the container:
#   - nvidia-smi present  -> it must run successfully;
#   - nvidia-smi absent   -> the NVIDIA device nodes must at least be visible.
if docker exec "${CONTAINER_NAME}" sh -lc 'command -v nvidia-smi >/dev/null 2>&1'; then
  if docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then
    log "GPU access verified with nvidia-smi."
  else
    err "nvidia-smi exists in the container but failed to access the GPU."
    err "Check NVIDIA Container Toolkit installation."
    exit 1
  fi
elif ! docker exec "${CONTAINER_NAME}" sh -lc 'test -c /dev/nvidiactl || test -c /dev/nvidia0'; then
  err "GPU device nodes are not visible inside the container."
  err "Check NVIDIA Container Toolkit installation."
  exit 1
else
  warn "nvidia-smi is not present in the container image; NVIDIA device nodes are visible."
  warn "The worker can still run, but GPU capability scoring will be unavailable."
fi
# -----------------------------
# Done
# -----------------------------
log "Tentacle worker installed and running."
# Summary for the operator, framed by blank lines.
printf '%s\n' \
  "" \
  " Container: ${CONTAINER_NAME}" \
  " Image: ${IMAGE}" \
  " Logs: docker logs -f ${CONTAINER_NAME}" \
  " Restart: systemctl restart docker" \
  ""