The script tried to read from stdin, which has already been consumed by the pipe. Altered to read from the TTY instead of stdin.
197 lines
5.0 KiB
Bash
197 lines
5.0 KiB
Bash
#!/usr/bin/env bash
#
# Installer for the Tentacle GPU worker container on Debian/Ubuntu hosts.
# Optional environment:
#   TENTACLE_TAG  image tag to deploy (default: "latest")

set -euo pipefail

# -----------------------------
# Config
# -----------------------------
# Honour the TENTACLE_TAG override; it must not be reassigned below, otherwise
# the override is silently discarded.
readonly IMAGE_TAG="${TENTACLE_TAG:-latest}"

readonly REGISTRY="hub.krkn.tech"
readonly IMAGE_BASE="${REGISTRY}/krkncli/cuda"
readonly CONTAINER_NAME="tentacle"

# CUDA versions the installer can choose between when building the image path.
readonly SUPPORTED_CUDA=("12.4.1" "12.5.1" "12.6.3" "13.0.1" "13.1.0")
|
|
|
|
# -----------------------------
|
|
# Helpers
|
|
# -----------------------------
|
|
# Print an informational message to stdout, prefixed with a bold green "[*]".
# Backslash escapes in the message are expanded, as with `echo -e`.
log() {
  printf '%b\n' "\033[1;32m[*]\033[0m $*"
}
|
|
# Print a warning to stdout, prefixed with a bold yellow "[!]".
# Backslash escapes in the message are expanded, as with `echo -e`.
warn() {
  printf '%b\n' "\033[1;33m[!]\033[0m $*"
}
|
|
# Print an error message to stderr, prefixed with a red "[✗]".
# Backslash escapes in the message are expanded, as with `echo -e`.
err() {
  printf '%b\n' "\033[0;31m[✗]\033[0m $*" >&2
}
|
|
|
|
# -----------------------------
# Root check
# -----------------------------
# Abort unless the effective user id is 0.
if (( $(id -u) != 0 )); then
  err "Run as root."
  exit 1
fi
|
|
|
|
# -----------------------------
# OS check
# -----------------------------
# The presence of `apt` is used as the marker for a Debian/Ubuntu host.
command -v apt >/dev/null || {
  err "This installer currently supports Debian/Ubuntu only."
  exit 1
}
|
|
|
|
# -----------------------------
# Ensure Docker
# -----------------------------
# Install Docker through the get.docker.com convenience script when the
# `docker` command is missing, then enable and start the service.
if command -v docker >/dev/null; then
  log "Docker already installed."
else
  log "Installing Docker..."
  curl -fsSL https://get.docker.com | sh
  systemctl enable --now docker
fi
|
|
|
|
# -----------------------------
# Ensure NVIDIA driver
# -----------------------------
# The driver is a prerequisite this script does not install itself.
command -v nvidia-smi >/dev/null || {
  err "NVIDIA driver not detected. Install driver first and reboot."
  exit 1
}

# Show the GPU summary; a non-zero exit here is deliberately tolerated.
nvidia-smi || true
|
|
|
|
# -----------------------------
# Ensure NVIDIA container runtime
# -----------------------------
# When `nvidia-ctk` is missing: register NVIDIA's apt repository (signed with
# a dedicated keyring), install the toolkit, wire it into Docker and restart
# the daemon so the runtime change takes effect.
if command -v nvidia-ctk >/dev/null; then
  log "NVIDIA container toolkit already installed."
else
  log "Installing NVIDIA container toolkit..."

  # Clear a previous repo list and keyring so the steps below start clean.
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list
  rm -f /usr/share/keyrings/nvidia-toolkit.gpg

  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
    | gpg --dearmor -o /usr/share/keyrings/nvidia-toolkit.gpg

  # Pin every "deb" entry of the upstream list to the keyring written above.
  curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
    | sed 's#^deb #deb [signed-by=/usr/share/keyrings/nvidia-toolkit.gpg] #' \
      > /etc/apt/sources.list.d/nvidia-toolkit.list

  apt-get update
  apt-get install -y nvidia-container-toolkit

  nvidia-ctk runtime configure --runtime=docker
  systemctl restart docker
fi
|
|
|
|
# -----------------------------
# Detect CUDA version
# -----------------------------
# Extract the value after "CUDA Version:" from the nvidia-smi banner.
# `|| true` is required: under `set -euo pipefail` a failing nvidia-smi (or a
# banner without that field) would otherwise abort the script right here,
# before the default-version fallback further down can run.
# `head -n 1` keeps the value single-line if the pattern matches more than once.
DETECTED_CUDA="$(
  nvidia-smi \
    | sed -n 's/.*CUDA Version: *\([0-9.]*\).*/\1/p' \
    | head -n 1
)" || true

if [[ -n "$DETECTED_CUDA" ]]; then
  log "Detected CUDA capability: ${DETECTED_CUDA}"
else
  warn "Could not detect CUDA capability from nvidia-smi output."
fi
|
|
|
|
#######################################
# Pick the supported CUDA image version that best fits the detected one.
# Globals:
#   SUPPORTED_CUDA (read) - array of full versions, e.g. "12.4.1"
#   DETECTED_CUDA  (read) - detected version; may be only major.minor
#                           (e.g. "12.4") or empty
# Outputs:
#   The chosen version on stdout, or an empty line when nothing fits.
#######################################
choose_cuda() {
  local v

  # Nothing detected: report "no match" and let the caller apply its default.
  if [[ -z "${DETECTED_CUDA:-}" ]]; then
    echo ""
    return
  fi

  # Exact match, or a supported version inside the detected release line.
  # The detected value is the (possibly shorter) prefix, so "12.4" selects
  # "12.4.1". The trailing dot keeps "12.4" from matching "12.40.x".
  for v in "${SUPPORTED_CUDA[@]}"; do
    if [[ "$v" == "$DETECTED_CUDA" || "$v" == "${DETECTED_CUDA}."* ]]; then
      echo "$v"
      return
    fi
  done

  # fallback: highest <= detected
  # Walk the supported versions from newest to oldest and take the first one
  # that version-sorts at or below the detected value.
  while IFS= read -r v; do
    if [[ "$(printf '%s\n%s\n' "$v" "$DETECTED_CUDA" | sort -V | head -n1)" == "$v" ]]; then
      echo "$v"
      return
    fi
  done < <(printf '%s\n' "${SUPPORTED_CUDA[@]}" | sort -rV)

  echo ""
}
|
|
|
|
# Resolve the CUDA version for the image; choose_cuda prints nothing when no
# supported version fits, in which case a fixed default is used.
CUDA_VERSION="$(choose_cuda)"

if [[ -n "$CUDA_VERSION" ]]; then
  log "Selected CUDA image version: ${CUDA_VERSION}"
else
  warn "Could not auto-match CUDA version, defaulting to latest (12.6.3)"
  CUDA_VERSION="12.6.3"
fi

# Full image reference: <base>/<cuda version>/tentacle:<tag>
IMAGE="${IMAGE_BASE}/${CUDA_VERSION}/tentacle:${IMAGE_TAG}"
|
|
|
|
# -----------------------------
# Registry login
# -----------------------------
# Drop any cached credentials first; a failure (e.g. not logged in) is ignored.
docker logout "$REGISTRY" >/dev/null 2>&1 || true

REG_USER="krkn-registry"

# Read the token from the terminal, not stdin: when the script itself is piped
# into the shell, stdin is that pipe. -s keeps the token off the screen.
read -rsp "Registry access token: " REG_PASS < /dev/tty
echo

# printf rather than echo: echo would treat a token such as "-n" or "-e" as an
# option and pass an empty or altered password to docker.
printf '%s\n' "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin || {
  err "Docker login failed. Invalid token or registry unreachable."
  exit 1
}

# The token is not needed after login; do not keep it in the shell.
unset REG_PASS
|
|
|
|
# -----------------------------
# Prompt config
# -----------------------------
# Both prompts read from the terminal device instead of stdin.
echo
read -rp "Orchestrator address (e.g. https://krkn.example.com): " ORCH_ADDR < /dev/tty
[[ -n "$ORCH_ADDR" ]] || {
  err "Orchestrator address is required."
  exit 1
}

# An empty answer keeps the host name as the worker id.
DEFAULT_WORKER_ID="$(hostname)"
read -rp "Worker ID [${DEFAULT_WORKER_ID}]: " WORKER_ID < /dev/tty
if [[ -z "$WORKER_ID" ]]; then
  WORKER_ID="$DEFAULT_WORKER_ID"
fi
|
|
|
|
# -----------------------------
# Pull image
# -----------------------------
log "Pulling image: ${IMAGE}"
if ! docker pull "${IMAGE}"; then
  err "Failed to pull image. Check token permissions or image availability."
  exit 1
fi
|
|
|
|
# -----------------------------
# Stop old container
# -----------------------------
# Best effort: stop, then remove, a container of the same name. Errors (such
# as "no such container") are intentionally silenced and ignored.
for docker_action in stop rm; do
  docker "$docker_action" "${CONTAINER_NAME}" 2>/dev/null || true
done
|
|
|
|
# -----------------------------
# Create dirs
# -----------------------------
# Host directories that are bind-mounted into the container.
mkdir -p /tmp/tentacle /tmp/tentacle-logs
|
|
|
|
# -----------------------------
# Run container
# -----------------------------
log "Starting Tentacle worker..."

# Options are collected in an array so each one stays a separate, quoted word.
run_args=(
  -d
  --name "${CONTAINER_NAME}"
  --network host
  --gpus all
  --restart unless-stopped
  -e ORCHESTRATOR_ADDRESS="${ORCH_ADDR}"
  -e WORKER_ID="${WORKER_ID}"
  -v /tmp/tentacle:/tmp/tentacle
  -v /tmp/tentacle-logs:/opt/tentacle/logs
)

docker run "${run_args[@]}" "${IMAGE}"
|
|
|
|
# -----------------------------
# Done
# -----------------------------
log "Tentacle worker installed and running."

# Summary for the operator, framed by one blank line above and below.
printf '%s\n' \
  "" \
  " Container: ${CONTAINER_NAME}" \
  " Image: ${IMAGE}" \
  " Logs: docker logs -f ${CONTAINER_NAME}" \
  " Restart: systemctl restart docker" \
  ""