Skip to Main Content
Research & Decision Installing Conda and PyTorch on NVIDIA DGX Spark

llama-swap

Llama
llama-swap (mostlygeek/llama-swap) is a lightweight, open-source proxy server designed for local LLM users, particularly those using llama.cpp. It automatically manages the loading and unloading of different AI models in response to API requests, enabling seamless, dynamic model switching without manual intervention.

Interface

Commands

# Reload unit definitions after any edit, then manage the llama-swap service.
sudo systemctl daemon-reload
sudo systemctl start llama-swap.service
sudo systemctl status llama-swap.service
# Show the last 80 journal lines for the service without paging.
journalctl -u llama-swap.service -n 80 --no-pager
sudo systemctl stop llama-swap.service
# Clear the restart rate-limit counter after repeated failures.
# (Was "llama-server.service" — fixed to the unit managed above.)
sudo systemctl reset-failed llama-swap.service

Installation

# Build llama-swap from source (Node.js/npm for the UI, Go for the proxy).
cd "$HOME/codebase"
sudo apt update
sudo apt install nodejs npm golang-go
git clone https://github.com/mostlygeek/llama-swap
cd llama-swap
make clean all
# Confirm the arm64 binary was produced before installing it.
ls -al "$HOME/codebase/llama-swap/build/llama-swap-linux-arm64"
sudo cp "$HOME/codebase/llama-swap/build/llama-swap-linux-arm64" \
  /usr/local/bin/llama-swap
sudo chown sysadmin:sysadmin /usr/local/bin/llama-swap

Configuration

Filename: /etc/default/llama-swap.profile

# Environment for the llama-swap service (loaded via systemd EnvironmentFile).
# Cap GGML RPC transfer chunk size at 32 MiB.
GGML_RPC_MAX_CHUNK=33554432
# llama.cpp logging: colored output, level prefix, timestamps on every line.
LLAMA_LOG_COLORS=1
LLAMA_LOG_PREFIX=1
LLAMA_LOG_TIMESTAMPS=1
# Locate the llama.cpp shared libraries from the local build tree.
# (Trailing whitespace removed — some env-file parsers keep it in the value.)
LD_LIBRARY_PATH=/home/sysadmin/codebase/llama.cpp/build/bin

Startup

# Manual Validation — run the proxy in the foreground to check it starts.
# NOTE(review): llama-swap's -config flag normally expects its YAML model
# configuration file, not an env-style profile — confirm this file's format
# against the llama-swap README before relying on this invocation.
/usr/local/bin/llama-swap -config /etc/default/llama-swap.profile

Systemctl

Filename: /etc/systemd/system/llama-swap.service

# Create the unit file, paste the block below, save, then lock down ownership.
# (Original paths were garbled by a terminal paste: "…system$ cat llama-server.service".)
sudo vi /etc/systemd/system/llama-swap.service
# Paste Block Below
# Save
sudo chown root:root /etc/systemd/system/llama-swap.service
[Unit]
# llama-swap model-swapping proxy (binary built in the Installation section).
Description=llama-swap proxy
Wants=network-online.target
After=network-online.target docker.service openwebui.service
Requires=docker.service openwebui.service
# Allow at most 10 restart attempts within 300 s before giving up.
StartLimitIntervalSec=300
StartLimitBurst=10

[Service]
Type=simple
User=sysadmin
Group=sysadmin
SupplementaryGroups=docker

Environment=LLAMA_LOG_COLORS=1
Environment=LLAMA_LOG_PREFIX=1
Environment=LLAMA_LOG_TIMESTAMPS=1
# Profile created in the Configuration section of this document.
# (Was /etc/default/llama-server.profile, which this document never creates.)
EnvironmentFile=/etc/default/llama-swap.profile
# NOTE(review): the original ExecStart was /usr/local/bin/llama-server-start,
# which this document never installs; the binary installed above is llama-swap.
# Confirm the -config file format — llama-swap normally expects a YAML config.
ExecStart=/usr/local/bin/llama-swap -config /etc/default/llama-swap.profile

# Let systemd stop the main process cleanly (SIGTERM below).
# ExecStop is optional; keep only if it actually performs shutdown logic.
# ExecStop=/home/sysadmin/bin/llama.server.stop.sh
Restart=always
RestartSec=10

KillSignal=SIGTERM
TimeoutStopSec=60
KillMode=mixed

# Logging (file-based). Log name matches the logrotate rule below
# (was llama-server.log, which the llama-swap rotate rule would never touch).
StandardOutput=append:/var/log/llama/llama-swap.log
StandardError=append:/var/log/llama/llama-swap.log
# Create /var/log/llama with correct perms at service start (systemd v235+).
LogsDirectory=llama

# Hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=full
ProtectHome=false
ReadWritePaths=/var/log/llama

[Install]
WantedBy=multi-user.target

Log Rotate

# logrotate rule: rotate the service log daily, keep 7 compressed copies.
sudo vi /etc/logrotate.d/llama-swap
# --- Paste the block below into the file ---
/var/log/llama/llama-swap.log {
    daily
    rotate 7
    compress
    missingok
    notifempty
    # copytruncate lets the running service keep its open file handle.
    copytruncate
}
# --- End of pasted block ---
#
sudo chown root:root /etc/logrotate.d/llama-swap
#
ls -al /etc/logrotate.d/llama-swap
#
# Force an immediate rotation to validate the rule.
sudo logrotate -f /etc/logrotate.d/llama-swap
#