115 lines
3.5 KiB
Nix
115 lines
3.5 KiB
Nix
# Module arguments. `osConfig` is read for the host name, `pkgs` supplies
# docker and writeShellApplication, `lib` supplies mkIf.
{ lib, osConfig, pkgs, ... }:
|
|
|
|
let
|
|
# True only when the system host name is "pc-mans"; the module body below is
# wrapped in `lib.mkIf isPC`, so nothing is configured on other hosts.
isPC =
  let
    inherit (osConfig.networking) hostName;
  in
  hostName == "pc-mans";

# Container image for vLLM. This is AMD's nightly build, which ships ROCm,
# PyTorch and vLLM prebuilt with RDNA 4 kernel coverage. The stable
# `rocm/vllm:latest` tag lags behind and may not include gfx1201 yet, so
# recheck Docker Hub before pinning to a release tag.
vllmImage = "rocm/vllm-dev:nightly";

# Container image for the Open WebUI chat frontend.
openWebUIImage = "ghcr.io/open-webui/open-webui:main";

# Model served when neither a CLI argument nor VLLM_MODEL selects one.
# Alternative kept for reference: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
defaultModel = "Qwen/Qwen2.5-14B-Instruct";
|
|
|
|
# `vllm-serve [MODEL [ARGS...]]`
#
# Runs `vllm serve` in the foreground inside the vLLM container image.
#   * MODEL: the first CLI argument if one is given, otherwise $VLLM_MODEL,
#     otherwise the `defaultModel` binding.
#   * ARGS: everything after MODEL is forwarded unchanged to `vllm serve`.
# The container gets /dev/kfd and /dev/dri, publishes port 8000, and mounts
# the user's Hugging Face cache at /root/.cache/huggingface.
vllm-serve = pkgs.writeShellApplication {
  name = "vllm-serve";
  runtimeInputs = [ pkgs.docker ];
  text = ''
    # An explicit first argument wins over the environment and the default.
    if (( $# > 0 )); then
      model="$1"
      shift
    else
      model="''${VLLM_MODEL:-${defaultModel}}"
    fi

    hf_cache="$HOME/.cache/huggingface"
    mkdir -p "$hf_cache"

    docker_flags=(
      --rm
      --name vllm
      --device=/dev/kfd --device=/dev/dri
      --security-opt seccomp=unconfined
      --group-add video --group-add render
      --ipc=host --shm-size=16G
      --env PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
      -p 8000:8000
      -v "''${hf_cache}:/root/.cache/huggingface"
    )

    # exec so that the docker client replaces this shell process.
    exec docker run "''${docker_flags[@]}" ${vllmImage} vllm serve "$model" "$@"
  '';
};
|
|
|
|
# `open-webui-run`
#
# Runs the Open WebUI container in the foreground, pointed at the
# OpenAI-compatible endpoint on the host's port 8000 (reached from inside
# the container through host.docker.internal -> host-gateway). UI state is
# kept in ~/.local/share/open-webui.
#
# The UI is published on 127.0.0.1:3000 only. WEBUI_AUTH=False turns off the
# login wall, and a bare `-p 3000:8080` would publish on every host
# interface, making the unauthenticated UI reachable from other machines.
# To expose it deliberately, change the bind address and drop WEBUI_AUTH=False.
open-webui-run = pkgs.writeShellApplication {
  name = "open-webui-run";
  runtimeInputs = [ pkgs.docker ];
  text = ''
    mkdir -p "$HOME/.local/share/open-webui"

    # Loopback-only publish: authentication is disabled below.
    exec docker run --rm \
      --name open-webui \
      -p 127.0.0.1:3000:8080 \
      -e "OPENAI_API_BASE_URL=http://host.docker.internal:8000/v1" \
      -e "OPENAI_API_KEY=dummy" \
      -e "WEBUI_AUTH=False" \
      --add-host=host.docker.internal:host-gateway \
      -v "$HOME/.local/share/open-webui:/app/backend/data" \
      ${openWebUIImage}
  '';
};
|
|
in
|
|
lib.mkIf isPC {
|
|
# Install both launcher scripts into the user profile so they can also be
# run by hand, outside the systemd units.
home.packages = [ vllm-serve open-webui-run ];
|
|
|
|
# vLLM as a user service.
#   start : systemctl --user start vllm
#   logs  : journalctl --user -fu vllm
#   model : systemctl --user edit vllm, then add
#           Environment=VLLM_MODEL=org/name
systemd.user.services.vllm =
  let
    docker = "${pkgs.docker}/bin/docker";
  in
  {
    Unit = {
      Description = "vLLM OpenAI-compatible server (ROCm Docker)";
      # NOTE(review): docker.service is normally a system unit; a user
      # manager can only order against it if a user-level docker.service
      # exists (e.g. rootless Docker). Confirm these two lines have an effect.
      Wants = [ "docker.service" ];
      After = [ "docker.service" ];
    };

    Service = {
      # Remove any leftover container with the same name; the leading "-"
      # makes systemd ignore a failure when there is nothing to remove.
      ExecStartPre = "-${docker} rm -f vllm";
      ExecStart = "${vllm-serve}/bin/vllm-serve";
      # Give the container 30 seconds to shut down before docker kills it.
      ExecStop = "${docker} stop -t 30 vllm";

      Restart = "on-failure";
      RestartSec = "10s";

      # Generous start timeout: the first start pulls a ~20 GB image.
      TimeoutStartSec = "30min";
    };
  };
|
|
|
|
# Open WebUI as a user service.
#   start : systemctl --user start open-webui
#   use   : browse http://localhost:3000
# The unit pulls in vllm.service (Wants + After), so one command brings up
# both.
# WEBUI_AUTH=False disables the login wall. That is fine for localhost-only
# use; remove it if you ever expose port 3000 outside this host.
systemd.user.services.open-webui =
  let
    docker = "${pkgs.docker}/bin/docker";
    # Units this service wants started and is ordered after.
    dependencies = [
      "docker.service"
      "vllm.service"
    ];
  in
  {
    Unit = {
      Description = "Open WebUI (chat UI for the vLLM endpoint)";
      # NOTE(review): docker.service is normally a system unit; confirm a
      # user-level docker.service exists, otherwise that entry has no effect.
      After = dependencies;
      Wants = dependencies;
    };

    Service = {
      # Remove any leftover container with the same name; the leading "-"
      # makes systemd ignore a failure when there is nothing to remove.
      ExecStartPre = "-${docker} rm -f open-webui";
      ExecStart = "${open-webui-run}/bin/open-webui-run";
      # Give the container 30 seconds to shut down before docker kills it.
      ExecStop = "${docker} stop -t 30 open-webui";

      Restart = "on-failure";
      RestartSec = "10s";

      # Generous start timeout: the first start pulls ~5 GB.
      TimeoutStartSec = "15min";
    };
  };
|
|
}
|