Files
Tentacle/install.sh

325 lines
7.9 KiB
Bash
Raw Normal View History

2026-01-22 09:50:28 -05:00
#!/usr/bin/env bash
set -euo pipefail

# -----------------------------
# Config
# -----------------------------
# Image tag to deploy; override by exporting TENTACLE_TAG (default: latest).
IMAGE_TAG="${TENTACLE_TAG:-latest}"
# Registry host used for docker login/logout.
REGISTRY="hub.krkn.tech"
# Repository prefix; "/<cuda version>/tentacle:<tag>" is appended later.
IMAGE_BASE="${REGISTRY}/krkncli/cuda"
# Name given to the worker container.
CONTAINER_NAME="tentacle"
# CUDA versions the installer will pick an image from.
SUPPORTED_CUDA=("12.4.1" "12.5.1" "12.6.3" "13.0.1" "13.1.0")
readonly IMAGE_TAG REGISTRY IMAGE_BASE CONTAINER_NAME SUPPORTED_CUDA
# -----------------------------
# Helpers
# -----------------------------
# Colored status helpers. All arguments are joined with spaces and printed
# verbatim (no backslash-escape interpretation of the message itself).
# log:  green  "[*]" progress line on stdout.
log() { printf '\033[1;32m[*]\033[0m %s\n' "$*"; }
# warn: yellow "[!]" warning line on stdout.
warn() { printf '\033[1;33m[!]\033[0m %s\n' "$*"; }
# err:  red    "[✗]" error line on stderr.
err() { printf '\033[0;31m[✗]\033[0m %s\n' "$*" >&2; }
# -----------------------------
# Root check
# -----------------------------
# Abort unless the effective user id is 0.
[ "$(id -u)" -eq 0 ] || {
  err "Run as root."
  exit 1
}
# -----------------------------
# OS check
# -----------------------------
# The presence of apt is used as the Debian/Ubuntu indicator.
command -v apt >/dev/null || {
  err "This installer currently supports Debian/Ubuntu only."
  exit 1
}
# -----------------------------
# Ensure Docker
# -----------------------------
# Install Docker through the get.docker.com script only when the docker CLI
# is missing, then enable and start the service.
if command -v docker >/dev/null; then
  log "Docker already installed."
else
  log "Installing Docker..."
  curl -fsSL https://get.docker.com | sh
  systemctl enable --now docker
fi
# -----------------------------
# Ensure NVIDIA driver
# -----------------------------
# The driver is not installed by this script; its absence is fatal.
command -v nvidia-smi >/dev/null || {
  err "NVIDIA driver not detected. Install driver first and reboot."
  exit 1
}
# Print the GPU summary for the operator; a failure here is not fatal.
nvidia-smi || true
# -----------------------------
# Ensure NVIDIA container runtime
# -----------------------------
# Install the NVIDIA container toolkit from NVIDIA's apt repository when
# nvidia-ctk is missing, then register the runtime with Docker.
if ! command -v nvidia-ctk >/dev/null; then
  log "Installing NVIDIA container toolkit..."
  nvidia_keyring="/usr/share/keyrings/nvidia-toolkit.gpg"
  nvidia_list="/etc/apt/sources.list.d/nvidia-toolkit.list"
  # Remove both the list file this script writes and the differently named
  # one it previously tried to clean up, so a re-run never leaves two list
  # files pointing at the same repository with different signed-by keys.
  rm -f -- "$nvidia_list" /etc/apt/sources.list.d/nvidia-container-toolkit.list
  # gpg --dearmor refuses to overwrite an existing output file.
  rm -f -- "$nvidia_keyring"
  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
    | gpg --dearmor -o "$nvidia_keyring"
  # Pin the repository entries to the keyring fetched above.
  curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
    | sed "s#^deb #deb [signed-by=${nvidia_keyring}] #" \
    > "$nvidia_list"
  apt-get update
  apt-get install -y nvidia-container-toolkit
  nvidia-ctk runtime configure --runtime=docker
  systemctl restart docker
else
  log "NVIDIA container toolkit already installed."
fi
# -----------------------------
# Detect CUDA version
# -----------------------------
# Pull the "CUDA Version: X.Y" value out of the nvidia-smi banner.
# The pipeline status is ignored on purpose: an empty result is checked
# explicitly below so the operator gets a message instead of a silent exit.
DETECTED_CUDA="$(nvidia-smi | sed -n 's/.*CUDA Version: *\([0-9][0-9.]*\).*/\1/p')" || true
if [[ -z "$DETECTED_CUDA" ]]; then
  err "Could not determine CUDA version from nvidia-smi output."
  exit 1
fi
log "Detected CUDA capability: ${DETECTED_CUDA}"
# Extract major.minor (e.g., 12.4)
DETECTED_MM="$(cut -d. -f1,2 <<<"$DETECTED_CUDA")"
#######################################
# Pick the supported CUDA version matching the detected major.minor.
# Globals:   SUPPORTED_CUDA (read), DETECTED_MM (read)
# Arguments: none
# Outputs:   the highest matching version (by `sort -V`) on stdout
# Returns:   0 when a match was printed, 1 when nothing matches
#######################################
choose_cuda() {
  local best=""
  local candidate candidate_mm

  for candidate in "${SUPPORTED_CUDA[@]}"; do
    candidate_mm="$(cut -d. -f1,2 <<<"$candidate")"
    # Only consider versions with the same major.minor as the driver.
    if [[ "$candidate_mm" != "$DETECTED_MM" ]]; then
      continue
    fi
    # Keep the highest patch level seen so far for that minor.
    if [[ -z "$best" ]] \
      || [[ "$(printf '%s\n%s\n' "$best" "$candidate" | sort -V | tail -n1)" == "$candidate" ]]; then
      best="$candidate"
    fi
  done

  if [[ -n "$best" ]]; then
    echo "$best"
    return 0
  fi
  return 1
}
# Resolve the image version for the detected driver; stop with an explanation
# when no supported version shares the driver's major.minor.
if ! CUDA_VERSION="$(choose_cuda)"; then
  err "No Tentacle image for CUDA ${DETECTED_MM}.x"
  err "Driver reports CUDA ${DETECTED_CUDA}"
  err "Available images: ${SUPPORTED_CUDA[*]}"
  err "Install compatible NVIDIA driver or contact support."
  exit 1
fi
log "Selected CUDA image version: ${CUDA_VERSION}"

# Full image reference: <base>/<cuda version>/tentacle:<tag>
IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"
# -----------------------------
# Registry login
# -----------------------------
# Best-effort logout first; failure (e.g. not logged in) is ignored on purpose.
docker logout "$REGISTRY" >/dev/null 2>&1 || true
REG_USER="krkn-registry"
# Token comes from KRKN_REGISTRY_TOKEN, otherwise it is prompted for (no echo).
REG_PASS="${KRKN_REGISTRY_TOKEN:-}"
if [[ -z "$REG_PASS" ]]; then
  # Read from the terminal explicitly so the prompt works when the script
  # itself arrives on stdin. A missing tty falls through to the check below.
  read -rsp "Registry access token: " REG_PASS </dev/tty || true
  echo
fi
if [[ -z "$REG_PASS" ]]; then
  err "Registry access token is required."
  exit 1
fi
# Pass the token on stdin so it never appears in the process argument list.
if ! printf '%s\n' "$REG_PASS" \
  | docker login "$REGISTRY" -u "$REG_USER" --password-stdin; then
  err "Docker login failed. Invalid token or registry unreachable."
  exit 1
fi
# -----------------------------
# Prompt config
# -----------------------------
# Orchestrator address comes from KRKN_ORCH_ADDR, otherwise it is prompted for.
ORCH_ADDR="${KRKN_ORCH_ADDR:-}"
if [[ -z "$ORCH_ADDR" ]]; then
  echo
  read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR </dev/tty
  if [[ -z "$ORCH_ADDR" ]]; then
    err "Orchestrator address is required."
    exit 1
  fi
fi
# -----------------------------
# Normalize orchestrator address
# -----------------------------
#######################################
# Normalize an orchestrator address to [scheme]host:port[path].
#   - an http:// or https:// scheme is matched case-insensitively and
#     emitted in lowercase; no scheme is added when none was given
#   - the first "-krkn" that ends a hostname label (followed by "." or the
#     end of the host) is stripped: krkn-krkn.example.com -> krkn.example.com,
#     foo-krkn -> foo. Per the original author's note, the conductor serves
#     on the primary host at port 65535; the "-krkn" name belongs to the
#     krkn service, not the conductor.
#   - port 65535 is appended when no numeric port is present
# Arguments: $1 - address as entered by the user
# Outputs:   normalized address on stdout
#######################################
normalize_orch_addr() {
  local input="$1"
  local scheme=""
  local rest="$input"
  local hostport=""
  local path=""

  # Extract scheme if present (URL schemes are case-insensitive).
  if [[ "$input" =~ ^([Hh][Tt][Tt][Pp][Ss]?://)(.*)$ ]]; then
    rest="${BASH_REMATCH[2]}"
    scheme="$(printf '%s' "${BASH_REMATCH[1]}" | tr '[:upper:]' '[:lower:]')"
  fi

  # Split host[:port] and path.
  if [[ "$rest" =~ ^([^/]+)(/.*)?$ ]]; then
    hostport="${BASH_REMATCH[1]}"
    path="${BASH_REMATCH[2]:-}"
  else
    hostport="$rest"
  fi

  # Split host and port; only a trailing all-digit segment counts as a port.
  local host="$hostport"
  local port=""
  if [[ "$hostport" =~ ^(.+):([0-9]+)$ ]]; then
    host="${BASH_REMATCH[1]}"
    port="${BASH_REMATCH[2]}"
  fi

  # Strip the -krkn suffix from the hostname label only.
  host="$(sed -E 's/-krkn(\.|$)/\1/' <<<"$host")"

  # If no port was given, use the default.
  if [[ -z "$port" ]]; then
    port="65535"
  fi

  echo "${scheme}${host}:${port}${path}"
}
ORCH_ADDR="$(normalize_orch_addr "$ORCH_ADDR")"
log "Normalized orchestrator address: ${ORCH_ADDR}"

# Worker ID defaults to the machine's hostname when the prompt is left empty.
DEFAULT_WORKER_ID="$(hostname)"
read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID </dev/tty
WORKER_ID="${WORKER_ID:-$DEFAULT_WORKER_ID}"
# -----------------------------
# Pull image
# -----------------------------
# Pull up front so registry/permission problems surface with a clear message
# before the old container is touched.
log "Pulling image: ${IMAGE}"
if ! docker pull --quiet "${IMAGE}"; then
  err "Failed to pull image. Check token permissions or image availability."
  exit 1
fi
# -----------------------------
# Stop old container
# -----------------------------
#######################################
# Remove a container by name, escalating if Docker cannot remove it.
# Order: docker stop -> docker rm -f -> kill -9 of the container's main
# PID followed by a final docker rm.
# Arguments: $1 - container name
# Outputs:   progress via log/warn, failure via err
# Returns:   0 when the container is absent or was removed; exits the
#            script with status 1 when the container still exists afterwards
#######################################
cleanup_container() {
  local name="$1"
  local pid=""

  # If the container doesn't exist, there is nothing to do (and nothing to log).
  if ! docker inspect "$name" >/dev/null 2>&1; then
    return 0
  fi
  log "Cleaning up existing container: $name"

  # Try a graceful stop first, then a forced remove. Both are best-effort;
  # the inspect calls below decide whether they worked.
  docker stop "$name" >/dev/null 2>&1 || true
  docker rm -f "$name" >/dev/null 2>&1 || true

  # Check if it's still there.
  if docker inspect "$name" >/dev/null 2>&1; then
    warn "Docker could not remove $name, attempting manual kill..."
    pid="$(docker inspect -f '{{.State.Pid}}' "$name" 2>/dev/null || true)"
    # Only signal a real, positive PID; Docker reports 0 for a non-running
    # container, and anything non-numeric must never reach kill.
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
      warn "Killing stuck container process (PID=$pid)"
      kill -9 "$pid" >/dev/null 2>&1 || true
      sleep 1
    fi
    # Final remove attempt.
    docker rm "$name" >/dev/null 2>&1 || true
  fi

  # Verify.
  if docker inspect "$name" >/dev/null 2>&1; then
    err "Failed to remove container $name (manual intervention required)"
    exit 1
  fi
}
cleanup_container "${CONTAINER_NAME}"
# -----------------------------
# Create dirs
# -----------------------------
# Host directories that are bind-mounted into the container.
mkdir -p /tmp/tentacle /tmp/tentacle-logs
# -----------------------------
# Run container
# -----------------------------
log "Starting Tentacle worker..."
# Arguments are kept in an array so each option stays a separate word and
# can carry its own comment.
run_args=(
  -d
  --pull=always
  --name "${CONTAINER_NAME}"
  --network host
  # GPU access through the NVIDIA runtime configured earlier.
  --runtime=nvidia
  --gpus all
  --restart unless-stopped
  # Configuration passed to the worker through its environment.
  -e "CONDUCTOR_ADDR=${ORCH_ADDR}"
  -e "WORKER_ID=${WORKER_ID}"
  # Host directories created by this script.
  -v /tmp/tentacle:/tmp/tentacle
  -v /tmp/tentacle-logs:/opt/tentacle/logs
)
docker run "${run_args[@]}" "${IMAGE}"
# -----------------------------
# Test GPU Access
# -----------------------------
# Wait a fixed five seconds, then confirm nvidia-smi runs inside the container.
sleep 5
docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1 || {
  err "GPU not accessible inside container."
  err "Check NVIDIA Container Toolkit installation."
  exit 1
}
log "GPU access verified."
# -----------------------------
# Done
# -----------------------------
log "Tentacle worker installed and running."
# Summary of what was deployed and how to inspect it.
printf '%s\n' \
  "" \
  " Container: ${CONTAINER_NAME}" \
  " Image: ${IMAGE}" \
  " Logs: docker logs -f ${CONTAINER_NAME}" \
  " Restart: systemctl restart docker" \
  ""