#!/usr/bin/env bash
#
# Tentacle worker installer (Debian/Ubuntu, NVIDIA GPU hosts).
#
# Ensures Docker and the NVIDIA container toolkit are present, picks the
# Tentacle image matching the CUDA version reported by the driver, logs in to
# the registry, and (re)starts the "tentacle" container.
#
# Optional environment variables:
#   TENTACLE_TAG         image tag to run (default: latest)
#   KRKN_REGISTRY_TOKEN  registry access token (prompted on the TTY if unset)
#   KRKN_ORCH_ADDR       orchestrator address (prompted on the TTY if unset)
#   KRKN_WORKER_ID       worker id (prompted on the TTY if unset; the prompt
#                        defaults to the hostname)
#
# The whole flow lives in main(), which is invoked on the very last line.
# The prompts read from /dev/tty, which suggests the script may be piped into
# bash; wrapping it in a function guarantees a truncated download never runs
# half a script and that child processes cannot consume script text on stdin.

set -euo pipefail

IMAGE_TAG="${TENTACLE_TAG:-latest}"

# -----------------------------
# Config
# -----------------------------
REGISTRY="hub.krkn.tech"
IMAGE_BASE="hub.krkn.tech/krakentech/cuda"
CONTAINER_NAME="tentacle"
SUPPORTED_CUDA=("12.4.1" "12.5.1" "12.6.3" "13.0.1" "13.1.0")

# -----------------------------
# Helpers
# -----------------------------
# printf (rather than echo -e) so backslashes in a message are printed
# literally instead of being interpreted as escapes.
log()  { printf '\033[1;32m[*]\033[0m %s\n' "$*"; }
warn() { printf '\033[1;33m[!]\033[0m %s\n' "$*"; }
err()  { printf '\033[0;31m[✗]\033[0m %s\n' "$*" >&2; }

#######################################
# Reduce a dotted version to "major.minor".
# Arguments: $1 - version string (e.g. 12.4.1)
# Outputs:   major.minor on stdout (e.g. 12.4); a version without a dot is
#            printed unchanged.
#######################################
major_minor() {
  local major="" minor="" _rest=""
  IFS=. read -r major minor _rest <<<"$1" || true
  printf '%s\n' "${major}${minor:+.${minor}}"
}

#######################################
# Read the CUDA version the driver reports via nvidia-smi.
# Outputs: the version (e.g. 12.4) on stdout.
# Returns: 0 when a numeric version was found; 1 when nvidia-smi failed or
#          printed no numeric "CUDA Version:" field.
#######################################
detect_cuda_version() {
  local smi_out="" line=""
  smi_out="$(nvidia-smi 2>/dev/null)" || return 1
  while IFS= read -r line; do
    if [[ "$line" =~ CUDA\ Version:\ *([0-9]+(\.[0-9]+)*) ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
      return 0
    fi
  done <<<"$smi_out"
  return 1
}

#######################################
# Pick the supported image version matching the detected CUDA major.minor.
# Globals:   SUPPORTED_CUDA (read), DETECTED_MM (read when $1 is omitted)
# Arguments: $1 - major.minor to match (optional; defaults to DETECTED_MM)
# Outputs:   the highest matching version on stdout.
# Returns:   0 if a match was found, 1 otherwise.
#######################################
choose_cuda() {
  local want_mm="${1:-${DETECTED_MM:-}}"
  local best="" v="" v_mm=""

  for v in "${SUPPORTED_CUDA[@]}"; do
    v_mm="$(major_minor "$v")"
    # Only consider versions with the same major.minor.
    [[ "$v_mm" == "$want_mm" ]] || continue
    # Keep the highest patch release of that minor (version-aware sort).
    if [[ -z "$best" ]] \
      || [[ "$(printf '%s\n%s\n' "$best" "$v" | sort -V | tail -n1)" == "$v" ]]; then
      best="$v"
    fi
  done

  [[ -n "$best" ]] || return 1
  printf '%s\n' "$best"
}

#######################################
# Normalize an orchestrator address to [scheme]host:port[path].
#   - an optional http:// or https:// prefix is preserved
#   - a "-krkn" suffix on a hostname label is stripped (first occurrence)
#   - port 65535 is appended when no numeric port is given
# Arguments: $1 - address as entered by the user
# Outputs:   normalized address on stdout.
#######################################
normalize_orch_addr() {
  local input="$1"
  local scheme="" rest="" hostport="" path=""

  # Extract scheme if present.
  if [[ "$input" =~ ^(https?://)(.*)$ ]]; then
    scheme="${BASH_REMATCH[1]}"
    rest="${BASH_REMATCH[2]}"
  else
    rest="$input"
  fi

  # Split host[:port] and path.
  if [[ "$rest" =~ ^([^/]+)(/.*)?$ ]]; then
    hostport="${BASH_REMATCH[1]}"
    path="${BASH_REMATCH[2]:-}"
  else
    hostport="$rest"
  fi

  # Split host and port.
  local host="$hostport" port=""
  if [[ "$hostport" =~ ^(.+):([0-9]+)$ ]]; then
    host="${BASH_REMATCH[1]}"
    port="${BASH_REMATCH[2]}"
  fi

  # Strip the -krkn suffix from the hostname label only, e.g.
  #   krkn-krkn.example.com -> krkn.example.com
  #   foo-krkn              -> foo
  # The conductor serves on the primary host at port 65535; the -krkn name
  # is for the krkns service, not the orchestrator/conductor.
  host="$(printf '%s\n' "$host" | sed -E 's/-krkn(\.|$)/\1/')"

  # If no port was given, add the default.
  port="${port:-65535}"

  printf '%s\n' "${scheme}${host}:${port}${path}"
}

#######################################
# Stop and remove a container, escalating if Docker cannot remove it.
# Arguments: $1 - container name
# Returns:   0 if the container is gone (or never existed); exits the script
#            with status 1 if it still exists after all attempts.
#######################################
cleanup_container() {
  local name="$1"

  # If the container doesn't exist, there is nothing to do.
  if ! docker inspect "$name" >/dev/null 2>&1; then
    return 0
  fi

  log "Cleaning up existing container: $name"

  # Best effort: try a graceful stop first, then a forced remove. Failures
  # are tolerated here because the result is verified below.
  docker stop "$name" >/dev/null 2>&1 || true
  docker rm -f "$name" >/dev/null 2>&1 || true

  # Still there? Fall back to killing the container's main process.
  if docker inspect "$name" >/dev/null 2>&1; then
    warn "Docker could not remove $name, attempting manual kill..."

    local pid
    pid="$(docker inspect -f '{{.State.Pid}}' "$name" 2>/dev/null || echo "")"

    if [[ -n "$pid" && "$pid" != "0" ]]; then
      warn "Killing stuck container process (PID=$pid)"
      kill -9 "$pid" >/dev/null 2>&1 || true
      sleep 1
    fi

    # Final remove attempt.
    docker rm "$name" >/dev/null 2>&1 || true
  fi

  # Verify.
  if docker inspect "$name" >/dev/null 2>&1; then
    err "Failed to remove container $name (manual intervention required)"
    exit 1
  fi
}

#######################################
# Poll until a container reports a running state.
# Arguments: $1 - container name
#            $2 - timeout in seconds (default: 60)
# Returns:   0 once the container is running; 1 (after printing the
#            container logs) if it is stopped, paused, missing, or the
#            timeout elapses while it is still restarting.
#######################################
wait_for_container_running() {
  local name="$1"
  local timeout="${2:-60}"
  local elapsed=0
  local state=""

  while [[ "$elapsed" -lt "$timeout" ]]; do
    state="$(docker inspect -f '{{if .State.Running}}running{{else if .State.Restarting}}restarting{{else if .State.Paused}}paused{{else}}stopped{{end}}' "$name" 2>/dev/null || echo missing)"

    case "$state" in
      running)
        return 0
        ;;
      stopped | paused | missing)
        # Terminal states: waiting longer will not help.
        break
        ;;
      *)
        # "restarting" (or anything unexpected): poll again.
        sleep 2
        elapsed=$((elapsed + 2))
        ;;
    esac
  done

  err "Tentacle container did not reach a running state (last state: ${state})."
  docker logs "$name" || true
  return 1
}

main() {
  # -----------------------------
  # Root check
  # -----------------------------
  if [[ "$(id -u)" -ne 0 ]]; then
    err "Run as root."
    exit 1
  fi

  # -----------------------------
  # OS check
  # -----------------------------
  if ! command -v apt >/dev/null; then
    err "This installer currently supports Debian/Ubuntu only."
    exit 1
  fi

  # -----------------------------
  # Ensure Docker
  # -----------------------------
  if ! command -v docker >/dev/null; then
    log "Installing Docker..."
    curl -fsSL https://get.docker.com | sh
    systemctl enable --now docker
  else
    log "Docker already installed."
  fi

  # -----------------------------
  # Ensure NVIDIA driver
  # -----------------------------
  if ! command -v nvidia-smi >/dev/null; then
    err "NVIDIA driver not detected. Install driver first and reboot."
    exit 1
  fi

  # Informational only; a failure here is reported by the CUDA detection below.
  nvidia-smi || true

  # -----------------------------
  # Ensure NVIDIA container runtime
  # -----------------------------
  if ! command -v nvidia-ctk >/dev/null; then
    log "Installing NVIDIA container toolkit..."

    # Remove the toolkit source list and keyring under these names before
    # writing fresh ones (the list is written to nvidia-toolkit.list below).
    rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list
    rm -f /usr/share/keyrings/nvidia-toolkit.gpg

    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
      | gpg --dearmor -o /usr/share/keyrings/nvidia-toolkit.gpg

    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
      | sed 's#^deb #deb [signed-by=/usr/share/keyrings/nvidia-toolkit.gpg] #' \
        > /etc/apt/sources.list.d/nvidia-toolkit.list

    apt-get update
    apt-get install -y nvidia-container-toolkit

    nvidia-ctk runtime configure --runtime=docker
    systemctl restart docker
  else
    log "NVIDIA container toolkit already installed."
  fi

  # -----------------------------
  # Detect CUDA version
  # -----------------------------
  # Checked explicitly: a failing command substitution under `set -e` would
  # otherwise end the script without any message.
  if ! DETECTED_CUDA="$(detect_cuda_version)"; then
    err "Could not determine the CUDA version from nvidia-smi."
    err "Check that the NVIDIA driver is loaded and nvidia-smi runs cleanly."
    exit 1
  fi
  log "Detected CUDA capability: ${DETECTED_CUDA}"

  # Extract major.minor (e.g., 12.4).
  DETECTED_MM="$(major_minor "$DETECTED_CUDA")"

  if CUDA_VERSION="$(choose_cuda "$DETECTED_MM")"; then
    log "Selected CUDA image version: ${CUDA_VERSION}"
  else
    err "No Tentacle image for CUDA ${DETECTED_MM}.x"
    err "Driver reports CUDA ${DETECTED_CUDA}"
    err "Available images: ${SUPPORTED_CUDA[*]}"
    err "Install compatible NVIDIA driver or contact support."
    exit 1
  fi

  IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"

  # -----------------------------
  # Registry login
  # -----------------------------
  # Drop any cached credentials first; failure (not logged in) is fine.
  docker logout "$REGISTRY" >/dev/null 2>&1 || true

  REG_USER="krkn-registry"
  REG_PASS="${KRKN_REGISTRY_TOKEN:-}"

  if [[ -z "$REG_PASS" ]]; then
    read -rsp "Registry access token: " REG_PASS < /dev/tty
    echo
  fi

  # printf rather than echo: a token that looks like an echo option or
  # contains backslashes must reach docker unchanged. The token goes via
  # stdin so it never appears in the process argument list.
  printf '%s\n' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin || {
    err "Docker login failed. Invalid token or registry unreachable."
    exit 1
  }

  # -----------------------------
  # Prompt config
  # -----------------------------
  ORCH_ADDR="${KRKN_ORCH_ADDR:-}"

  if [[ -z "$ORCH_ADDR" ]]; then
    echo
    read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR < /dev/tty
    if [[ -z "$ORCH_ADDR" ]]; then
      err "Orchestrator address is required."
      exit 1
    fi
  fi

  # -----------------------------
  # Normalize orchestrator address
  # -----------------------------
  ORCH_ADDR="$(normalize_orch_addr "$ORCH_ADDR")"
  log "Normalized orchestrator address: ${ORCH_ADDR}"

  # Like the token and the orchestrator address, the worker id can be
  # preset through the environment so unattended installs need no TTY.
  WORKER_ID="${KRKN_WORKER_ID:-}"
  if [[ -z "$WORKER_ID" ]]; then
    DEFAULT_WORKER_ID="$(hostname)"
    read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID < /dev/tty
    WORKER_ID="${WORKER_ID:-$DEFAULT_WORKER_ID}"
  fi

  # -----------------------------
  # Pull image
  # -----------------------------
  log "Pulling image: ${IMAGE}"
  docker pull --quiet "${IMAGE}" || {
    err "Failed to pull image. Check token permissions or image availability."
    exit 1
  }

  # -----------------------------
  # Stop old container
  # -----------------------------
  cleanup_container "${CONTAINER_NAME}"

  # -----------------------------
  # Create dirs
  # -----------------------------
  mkdir -p /tmp/tentacle
  mkdir -p /tmp/tentacle-logs

  # -----------------------------
  # Run container
  # -----------------------------
  log "Starting Tentacle worker..."

  docker run -d \
    --pull=always \
    --name "${CONTAINER_NAME}" \
    --network host \
    --runtime=nvidia \
    --gpus all \
    --restart unless-stopped \
    -e CONDUCTOR_ADDR="${ORCH_ADDR}" \
    -e WORKER_ID="${WORKER_ID}" \
    -v /tmp/tentacle:/tmp/tentacle \
    -v /tmp/tentacle-logs:/opt/tentacle/logs \
    "${IMAGE}"

  # -----------------------------
  # Test GPU Access
  # -----------------------------
  # The function already prints the reason and the container logs on failure.
  wait_for_container_running "${CONTAINER_NAME}" 60 || exit 1

  if docker exec "${CONTAINER_NAME}" sh -lc 'command -v nvidia-smi >/dev/null 2>&1'; then
    if ! docker exec "${CONTAINER_NAME}" nvidia-smi >/dev/null 2>&1; then
      err "nvidia-smi exists in the container but failed to access the GPU."
      err "Check NVIDIA Container Toolkit installation."
      exit 1
    fi
    log "GPU access verified with nvidia-smi."
  elif docker exec "${CONTAINER_NAME}" sh -lc 'test -c /dev/nvidiactl || test -c /dev/nvidia0'; then
    warn "nvidia-smi is not present in the container image; NVIDIA device nodes are visible."
    warn "The worker can still run, but GPU capability scoring will be unavailable."
  else
    err "GPU device nodes are not visible inside the container."
    err "Check NVIDIA Container Toolkit installation."
    exit 1
  fi

  # -----------------------------
  # Done
  # -----------------------------
  log "Tentacle worker installed and running."
  echo
  echo " Container: ${CONTAINER_NAME}"
  echo " Image: ${IMAGE}"
  echo " Logs: docker logs -f ${CONTAINER_NAME}"
  echo " Restart: systemctl restart docker"
  echo
}

main "$@"