#!/usr/bin/env bash
#
# Tentacle worker installer (Debian/Ubuntu, run as root).
#
# Steps performed by this part of the script:
#   1. Verify root privileges and an apt-based system.
#   2. Install Docker and the NVIDIA container toolkit when missing.
#   3. Read the CUDA version reported by nvidia-smi and pick a matching
#      image version from SUPPORTED_CUDA.
#   4. Log in to the registry, prompt for the orchestrator address and the
#      worker ID, and pull the image.
#
# Environment:
#   TENTACLE_TAG   Image tag to install (default: latest).
#
# All prompts read from /dev/tty rather than stdin.
set -euo pipefail

IMAGE_TAG="${TENTACLE_TAG:-latest}"

# -----------------------------
# Config
# -----------------------------
REGISTRY="hub.krkn.tech"
IMAGE_BASE="${REGISTRY}/krkncli/cuda"
CONTAINER_NAME="tentacle"
SUPPORTED_CUDA=("12.4.1" "12.5.1" "12.6.3" "13.0.1" "13.1.0")

# -----------------------------
# Helpers
# -----------------------------
# Messages are passed through '%s' so backslashes in user-supplied text
# (e.g. the orchestrator address) are printed literally.
log()  { printf '\033[1;32m[*]\033[0m %s\n' "$*"; }
warn() { printf '\033[1;33m[!]\033[0m %s\n' "$*" >&2; }
err()  { printf '\033[0;31m[✗]\033[0m %s\n' "$*" >&2; }

# Install an apt package when the command it provides is missing.
# Arguments: $1 - command name to look for, $2 - apt package providing it
ensure_apt_cmd() {
  local cmd="$1"
  local pkg="$2"

  if command -v "$cmd" >/dev/null; then
    return 0
  fi

  log "Installing missing dependency: ${pkg}"
  apt-get update
  apt-get install -y "$pkg"
}

# Print the greater of two dotted version strings (version sort).
# Arguments: $1, $2 - versions to compare
version_max() {
  printf '%s\n%s\n' "$1" "$2" | sort -V | tail -n1
}

# -----------------------------
# Root check
# -----------------------------
if [[ "$(id -u)" -ne 0 ]]; then
  err "Run as root."
  exit 1
fi

# -----------------------------
# OS check
# -----------------------------
if ! command -v apt >/dev/null; then
  err "This installer currently supports Debian/Ubuntu only."
  exit 1
fi

# -----------------------------
# Ensure Docker
# -----------------------------
if ! command -v docker >/dev/null; then
  log "Installing Docker..."
  ensure_apt_cmd curl curl
  # With pipefail a failed download fails the whole pipeline; report it
  # instead of letting set -e abort without a message.
  if ! curl -fsSL https://get.docker.com | sh; then
    err "Docker installation failed."
    exit 1
  fi
  systemctl enable --now docker
else
  log "Docker already installed."
fi

# -----------------------------
# Ensure NVIDIA driver
# -----------------------------
if ! command -v nvidia-smi >/dev/null; then
  err "NVIDIA driver not detected. Install driver first and reboot."
  exit 1
fi

# Informational only; a failure here is reported later, when the CUDA
# version is detected.
nvidia-smi || true

# -----------------------------
# Ensure NVIDIA container runtime
# -----------------------------
if ! command -v nvidia-ctk >/dev/null; then
  log "Installing NVIDIA container toolkit..."
  ensure_apt_cmd curl curl
  ensure_apt_cmd gpg gnupg

  # Start from a clean slate: drop a list file under the upstream default
  # name as well as the keyring written below (gpg will not silently
  # overwrite an existing output file).
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list
  rm -f /usr/share/keyrings/nvidia-toolkit.gpg

  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
    | gpg --dearmor -o /usr/share/keyrings/nvidia-toolkit.gpg

  # Pin every "deb" entry of the upstream list to the keyring above.
  curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
    | sed 's#^deb #deb [signed-by=/usr/share/keyrings/nvidia-toolkit.gpg] #' \
    > /etc/apt/sources.list.d/nvidia-toolkit.list

  apt-get update
  apt-get install -y nvidia-container-toolkit
  nvidia-ctk runtime configure --runtime=docker
  systemctl restart docker
else
  log "NVIDIA container toolkit already installed."
fi

# -----------------------------
# Detect CUDA version
# -----------------------------
if ! NVIDIA_SMI_OUTPUT="$(nvidia-smi)"; then
  err "nvidia-smi failed. Check that the NVIDIA driver is loaded (a reboot may be required)."
  exit 1
fi

# Kept in a variable so the spaces in the pattern need no escaping in [[ =~ ]].
CUDA_VERSION_RE='CUDA Version: *([0-9]+(\.[0-9]+)*)'
if [[ "$NVIDIA_SMI_OUTPUT" =~ $CUDA_VERSION_RE ]]; then
  DETECTED_CUDA="${BASH_REMATCH[1]}"
else
  err "Could not determine the CUDA version from nvidia-smi output."
  exit 1
fi
log "Detected CUDA capability: ${DETECTED_CUDA}"

# Extract major.minor (e.g., 12.4)
DETECTED_MM="$(cut -d. -f1,2 <<<"$DETECTED_CUDA")"

# Select the image CUDA version for the detected driver capability.
#
# Preference order:
#   1. The highest patch release whose major.minor equals DETECTED_MM.
#   2. Otherwise the highest supported version whose major.minor is lower
#      than DETECTED_MM (NVIDIA drivers run older CUDA runtimes).
#
# Globals:  SUPPORTED_CUDA (read), DETECTED_MM (read)
# Outputs:  the selected version on stdout
# Returns:  0 when a version was selected, 1 when none is usable
choose_cuda() {
  local candidate=""
  local candidate_mm=""
  local exact=""
  local fallback=""

  for candidate in "${SUPPORTED_CUDA[@]}"; do
    candidate_mm="$(cut -d. -f1,2 <<<"$candidate")"

    if [[ "$candidate_mm" == "$DETECTED_MM" ]]; then
      if [[ -z "$exact" || "$(version_max "$exact" "$candidate")" == "$candidate" ]]; then
        exact="$candidate"
      fi
    elif [[ "$(version_max "$candidate_mm" "$DETECTED_MM")" == "$DETECTED_MM" ]]; then
      # candidate_mm is strictly older than the driver capability.
      if [[ -z "$fallback" || "$(version_max "$fallback" "$candidate")" == "$candidate" ]]; then
        fallback="$candidate"
      fi
    fi
  done

  if [[ -n "$exact" ]]; then
    printf '%s\n' "$exact"
    return 0
  fi

  if [[ -n "$fallback" ]]; then
    printf '%s\n' "$fallback"
    return 0
  fi

  return 1
}

if CUDA_VERSION="$(choose_cuda)"; then
  log "Selected CUDA image version: ${CUDA_VERSION}"
  if [[ "$(cut -d. -f1,2 <<<"$CUDA_VERSION")" != "$DETECTED_MM" ]]; then
    warn "No image built for CUDA ${DETECTED_MM}; falling back to the older ${CUDA_VERSION} image."
  fi
else
  err "No compatible CUDA image found for driver capability ${DETECTED_MM}.x"
  err "Supported versions: ${SUPPORTED_CUDA[*]}"
  exit 1
fi

IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"

# -----------------------------
# Registry login
# -----------------------------
# Drop any stale credentials first; failure to log out is not an error.
docker logout "$REGISTRY" >/dev/null 2>&1 || true

REG_USER="krkn-registry"
read -rsp "Registry access token: " REG_PASS < /dev/tty
echo

if [[ -z "$REG_PASS" ]]; then
  err "Registry access token is required."
  exit 1
fi

# printf '%s' passes the token verbatim (echo could misread a token that
# looks like an option or contains backslashes).
if ! printf '%s' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin; then
  err "Docker login failed. Invalid token or registry unreachable."
  exit 1
fi
unset REG_PASS

# -----------------------------
# Prompt config
# -----------------------------
echo
read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR < /dev/tty

if [[ -z "$ORCH_ADDR" ]]; then
  err "Orchestrator address is required."
  exit 1
fi

# -----------------------------
# Normalize orchestrator address
# -----------------------------
# Rewrite an orchestrator address into "[scheme]host:port[path]".
#
# - An optional http:// or https:// prefix is preserved.
# - A "-krkn" suffix on a hostname label is removed (first match only).
# - Port 65535 is appended when the address carries no port.
#
# Arguments: $1 - address as entered by the user
# Outputs:   the normalized address on stdout
normalize_orch_addr() {
  local input="$1"
  local scheme=""
  local rest=""
  local hostport=""
  local path=""

  # Extract scheme if present
  if [[ "$input" =~ ^(https?://)(.*)$ ]]; then
    scheme="${BASH_REMATCH[1]}"
    rest="${BASH_REMATCH[2]}"
  else
    rest="$input"
  fi

  # Split host[:port] and path
  if [[ "$rest" =~ ^([^/]+)(/.*)?$ ]]; then
    hostport="${BASH_REMATCH[1]}"
    path="${BASH_REMATCH[2]:-}"
  else
    hostport="$rest"
  fi

  # Split host and port
  local host="$hostport"
  local port=""
  if [[ "$hostport" =~ ^(.+):([0-9]+)$ ]]; then
    host="${BASH_REMATCH[1]}"
    port="${BASH_REMATCH[2]}"
  fi

  # Strip the "-krkn" suffix from a hostname label, e.g.
  #   krkn-krkn.example.com -> krkn.example.com
  #   foo-krkn              -> foo
  # The "-krkn" name belongs to the krkn service, not to the orchestrator
  # conductor, which is served on the primary host at port 65535.
  host="$(sed -E 's/-krkn(\.|$)/\1/' <<<"$host")"

  # If no port, add default
  if [[ -z "$port" ]]; then
    port="65535"
  fi

  printf '%s\n' "${scheme}${host}:${port}${path}"
}

ORCH_ADDR="$(normalize_orch_addr "$ORCH_ADDR")"
log "Normalized orchestrator address: ${ORCH_ADDR}"

DEFAULT_WORKER_ID="$(hostname)"
read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID < /dev/tty
WORKER_ID="${WORKER_ID:-$DEFAULT_WORKER_ID}"

# -----------------------------
# Pull image
# -----------------------------
log "Pulling image: ${IMAGE}"
if ! docker pull "${IMAGE}"; then
  err "Failed to pull image. Check token permissions or image availability."
  exit 1
fi

# -----------------------------
# Stop old container
# -----------------------------
log "Cleaning up existing Tentacle container (if any)..."
# "docker container inspect" only matches containers; plain "docker inspect"
# would also succeed for e.g. an image with the same name.
if docker container inspect "${CONTAINER_NAME}" >/dev/null 2>&1; then
  # Graceful stop is best-effort: the forced removal below must still run
  # even if the stop fails.
  docker stop "${CONTAINER_NAME}" || warn "Graceful stop failed; forcing removal."
  docker rm -f "${CONTAINER_NAME}"
fi

# -----------------------------
# Create dirs
# -----------------------------
# Host directories bind-mounted into the container below.
# NOTE(review): paths under /tmp may be cleaned up by the OS (e.g. on
# reboot) — confirm this is acceptable for worker data and logs.
mkdir -p /tmp/tentacle
mkdir -p /tmp/tentacle-logs

# -----------------------------
# Run container
# -----------------------------
log "Starting Tentacle worker..."

# Detached, host networking, all GPUs via the NVIDIA runtime, restarted by
# Docker unless explicitly stopped.
if ! docker run -d \
  --name "${CONTAINER_NAME}" \
  --network host \
  --runtime=nvidia \
  --gpus all \
  --restart unless-stopped \
  -e CONDUCTOR_ADDR="${ORCH_ADDR}" \
  -e WORKER_ID="${WORKER_ID}" \
  -v /tmp/tentacle:/tmp/tentacle \
  -v /tmp/tentacle-logs:/opt/tentacle/logs \
  "${IMAGE}"; then
  err "Failed to start the Tentacle container."
  exit 1
fi

# -----------------------------
# Done
# -----------------------------
log "Tentacle worker installed and running."
echo
echo "  Container: ${CONTAINER_NAME}"
echo "  Image:     ${IMAGE}"
echo "  Logs:      docker logs -f ${CONTAINER_NAME}"
# Restart only this worker instead of the whole Docker daemon.
echo "  Restart:   docker restart ${CONTAINER_NAME}"
echo