Files
Tentacle/install.sh

187 lines
4.8 KiB
Bash
Raw Normal View History

2026-01-22 09:50:28 -05:00
#!/usr/bin/env bash
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# -----------------------------
# Config
# -----------------------------
# Registry host used for `docker login`.
REGISTRY='hub.krkn.tech'
# Image repository prefix; the selected CUDA version, the image name and the
# tag are appended to it later in the script.
IMAGE_BASE="${REGISTRY}/krkn/cuda"
IMAGE_TAG='latest'
# Name given to the worker container.
CONTAINER_NAME='tentacle'
# CUDA versions for which an image can be selected.
SUPPORTED_CUDA=(
  '12.4.1'
  '12.5.1'
  '12.6.3'
  '13.0.1'
  '13.1.0'
)
# -----------------------------
# Helpers
# -----------------------------
# Each helper prints a colored tag followed by all of its arguments joined by
# spaces. The message is passed to printf as data (%s) rather than through
# `echo -e`, so backslash sequences inside a message (for example in a value
# typed by the user) are printed literally instead of being interpreted.

# log MESSAGE... - informational message, green "[*]" tag, written to stdout.
log() { printf '\033[1;32m[*]\033[0m %s\n' "$*"; }

# warn MESSAGE... - warning message, yellow "[!]" tag, written to stdout.
warn() { printf '\033[1;33m[!]\033[0m %s\n' "$*"; }

# err MESSAGE... - error message, red "[✗]" tag, written to stderr.
err() { printf '\033[0;31m[✗]\033[0m %s\n' "$*" >&2; }
# -----------------------------
# Root check
# -----------------------------
# Refuse to continue unless the effective user id is 0.
[[ "$(id -u)" -eq 0 ]] || {
  err "Run as root."
  exit 1
}
# -----------------------------
# OS check
# -----------------------------
# The presence of `apt` on PATH is used as the marker for a Debian/Ubuntu host.
command -v apt >/dev/null || {
  err "This installer currently supports Debian/Ubuntu only."
  exit 1
}
# -----------------------------
# Ensure Docker
# -----------------------------
# Only install when no `docker` binary is on PATH.
if command -v docker >/dev/null; then
  log "Docker already installed."
else
  log "Installing Docker..."
  # Fetch the get.docker.com script and run it, then enable and start the
  # docker service.
  curl -fsSL https://get.docker.com | sh
  systemctl enable --now docker
fi
# -----------------------------
# Ensure NVIDIA driver
# -----------------------------
# The driver is not installed by this script; `nvidia-smi` on PATH is the
# signal that it is already present.
command -v nvidia-smi >/dev/null || {
  err "NVIDIA driver not detected. Install driver first and reboot."
  exit 1
}
# Show the nvidia-smi summary to the operator; a non-zero exit status is
# deliberately tolerated here.
nvidia-smi || true
# -----------------------------
# Ensure NVIDIA container runtime
# -----------------------------
# Only install when `nvidia-ctk` is not already on PATH.
if command -v nvidia-ctk >/dev/null; then
  log "NVIDIA container toolkit already installed."
else
  log "Installing NVIDIA container toolkit..."
  nv_repo_url="https://nvidia.github.io/libnvidia-container"
  nv_keyring="/usr/share/keyrings/nvidia-toolkit.gpg"

  # Clear leftovers from an earlier attempt.
  # NOTE(review): the list file removed here (nvidia-container-toolkit.list)
  # has a different name from the one written below (nvidia-toolkit.list) -
  # presumably it clears a list created by another install method; confirm.
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list
  rm -f "${nv_keyring}"

  # Store the repository signing key in binary (dearmored) form.
  curl -fsSL "${nv_repo_url}/gpgkey" \
    | gpg --dearmor -o "${nv_keyring}"

  # Download the apt source list and pin every `deb` entry to that keyring.
  curl -fsSL "${nv_repo_url}/stable/deb/nvidia-container-toolkit.list" \
    | sed "s#^deb #deb [signed-by=${nv_keyring}] #" \
    > /etc/apt/sources.list.d/nvidia-toolkit.list

  apt-get update
  apt-get install -y nvidia-container-toolkit

  # Register the NVIDIA runtime with Docker, then restart the daemon.
  nvidia-ctk runtime configure --runtime=docker
  systemctl restart docker
fi
# -----------------------------
# Detect CUDA version
# -----------------------------
# Extract the value following "CUDA Version:" from the nvidia-smi output.
# DETECTED_CUDA is left empty when nvidia-smi fails or prints no numeric
# version. The trailing `|| true` matters: with `set -euo pipefail` a failing
# pipeline inside the command substitution would otherwise abort the script
# right here, and the default-version handling further down could never run.
DETECTED_CUDA="$(
  nvidia-smi \
    | sed -n 's/.*CUDA Version: *\([0-9][0-9.]*\).*/\1/p' \
    | head -n1 \
    || true
)"
if [[ -n "$DETECTED_CUDA" ]]; then
  log "Detected CUDA capability: ${DETECTED_CUDA}"
else
  warn "Could not detect CUDA version from nvidia-smi output."
fi
#######################################
# Pick the supported CUDA image version that best fits the detected one.
#
# Selection order:
#   1. The first SUPPORTED_CUDA entry that equals DETECTED_CUDA or extends it
#      with further components (detected "12.4" selects "12.4.1"). The
#      detected value usually has fewer components than the supported
#      entries, so the supported entry is tested against the detected prefix,
#      not the other way round.
#   2. Otherwise the highest SUPPORTED_CUDA entry that is <= DETECTED_CUDA
#      in version order (`sort -V`).
#
# Globals:   SUPPORTED_CUDA (read), DETECTED_CUDA (read)
# Arguments: none
# Outputs:   the chosen version on stdout, or an empty line when
#            DETECTED_CUDA is empty or lower than every supported version
# Returns:   0
#######################################
choose_cuda() {
  local v

  # Nothing detected: an empty prefix would match every entry, so stop early.
  if [[ -z "${DETECTED_CUDA:-}" ]]; then
    echo ""
    return
  fi

  for v in "${SUPPORTED_CUDA[@]}"; do
    # The "." in the pattern stops "12.4" from matching e.g. "12.40.0".
    if [[ "$v" == "$DETECTED_CUDA" || "$v" == "$DETECTED_CUDA".* ]]; then
      echo "$v"
      return
    fi
  done

  # fallback: highest <= detected
  while IFS= read -r v; do
    # v <= detected exactly when v sorts first of the pair.
    if [[ "$(printf '%s\n%s\n' "$v" "$DETECTED_CUDA" | sort -V | head -n1)" == "$v" ]]; then
      echo "$v"
      return
    fi
  done < <(printf '%s\n' "${SUPPORTED_CUDA[@]}" | sort -rV)

  echo ""
}
# Resolve the CUDA version used in the image path, falling back to a fixed
# default when choose_cuda printed nothing.
# NOTE(review): the warning text calls 12.6.3 "latest", but SUPPORTED_CUDA
# also lists 13.x versions - confirm which default is intended.
CUDA_VERSION="$(choose_cuda)"
if [[ -n "$CUDA_VERSION" ]]; then
  log "Selected CUDA image version: ${CUDA_VERSION}"
else
  warn "Could not auto-match CUDA version, defaulting to latest (12.6.3)"
  CUDA_VERSION="12.6.3"
fi

# Full image reference: <base>/<cuda version>/tentacle:<tag>.
IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"
# -----------------------------
# Registry login
# -----------------------------
# Prompt for credentials; -s keeps the password/token from being echoed to
# the terminal.
echo
read -rp "Registry username: " REG_USER
read -rsp "Registry password or token: " REG_PASS
echo

# Fail early with a clear message instead of handing empty credentials to
# docker (same treatment as the orchestrator address prompt).
if [[ -z "$REG_USER" || -z "$REG_PASS" ]]; then
  err "Registry username and password/token are required."
  exit 1
fi

# The secret is passed on stdin so it never appears in the process argument
# list. printf '%s' is used instead of echo because echo would treat a value
# such as "-n" or "-e" as an option and send something else to docker.
printf '%s' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin
# -----------------------------
# Prompt config
# -----------------------------
echo
read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR
# The orchestrator address has no default, so an empty answer is fatal.
[[ -n "$ORCH_ADDR" ]] || {
  err "Orchestrator address is required."
  exit 1
}

# The worker id defaults to this machine's hostname when the answer is empty.
DEFAULT_WORKER_ID="$(hostname)"
read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID
if [[ -z "$WORKER_ID" ]]; then
  WORKER_ID="$DEFAULT_WORKER_ID"
fi
# -----------------------------
# Pull image
# -----------------------------
# Pull before touching any existing container: under `set -e` a failed pull
# stops the script at this point.
log "Pulling image: ${IMAGE}"
docker pull "$IMAGE"
# -----------------------------
# Stop old container
# -----------------------------
# Best effort: stop, then remove, a container with the same name. Errors
# (for example when no such container exists) are silenced and ignored.
for docker_action in stop rm; do
  docker "$docker_action" "${CONTAINER_NAME}" 2>/dev/null || true
done
# -----------------------------
# Create dirs
# -----------------------------
# Host directories that are bind-mounted into the container below.
for host_dir in /tmp/tentacle /tmp/tentacle-logs; do
  mkdir -p "$host_dir"
done
# -----------------------------
# Run container
# -----------------------------
log "Starting Tentacle worker..."

# Options are collected in an array so each value stays a single argument.
run_opts=(
  -d
  --name "${CONTAINER_NAME}"
  --network host
  --gpus all
  --restart unless-stopped
  # Settings collected from the prompts above, passed as environment variables.
  -e ORCHESTRATOR_ADDRESS="${ORCH_ADDR}"
  -e WORKER_ID="${WORKER_ID}"
  # Bind mounts, host path:container path.
  -v /tmp/tentacle:/tmp/tentacle
  -v /tmp/tentacle-logs:/opt/tentacle/logs
)
docker run "${run_opts[@]}" "${IMAGE}"
# -----------------------------
# Done
# -----------------------------
log "Tentacle worker installed and running."
# Summary for the operator, framed by a blank line above and below.
cat <<EOF

 Container: ${CONTAINER_NAME}
 Image: ${IMAGE}
 Logs: docker logs -f ${CONTAINER_NAME}
 Restart: systemctl restart docker

EOF