#!/bin/bash
# Run script for Debian.
#
# The interpreter line has to be the very first line of the file, and the
# strict-mode options have to be a separate command: when they are fused onto
# the shebang the whole line is just a comment and none of the options apply.
#   -e          stop at the first command that fails
#   -u          treat expansion of an unset variable as an error
#   -x          trace every command as it runs
#   -o pipefail a pipeline fails if any stage fails
set -euxo pipefail
export DEBIAN_FRONTEND=noninteractive
# Mark cloud-init as "install" in the dpkg selections; best-effort, a failure
# here is deliberately ignored.
sudo dpkg --set-selections <<< "cloud-init install" || true
# Global settings: detect the operating system and, on Linux, the distribution.
# Sets OS (kernel name), DISTRO (os-release ID) and VERSION (os-release
# VERSION_ID) for the steps that follow.
OS="$(uname)"
case "$OS" in
  Linux)
    # Work out which distribution this is from /etc/os-release.
    if [[ -f /etc/os-release ]]; then
      # shellcheck source=/dev/null
      . /etc/os-release
      # Default to empty when a field is missing instead of expanding an
      # undefined variable.
      DISTRO="${ID:-}"
      VERSION="${VERSION_ID:-}"
    else
      echo "Linux 发行版不受支持。"
      exit 1
    fi
    ;;
  *)
    # Anything that is not Linux used to fall through silently, leaving
    # DISTRO and VERSION undefined for the later `case` statements.
    echo "不支持的操作系统: $OS"
    exit 1
    ;;
esac
# Detect NVIDIA hardware on the PCI bus. An empty NVIDIA_PRESENT means nothing
# matched; `|| true` keeps a non-matching grep from counting as a failure.
NVIDIA_PRESENT=$(lspci | grep -i nvidia || true)

#######################################
# Install each named package only if apt knows about it; otherwise report the
# skip on stderr and carry on.
# Arguments: package names
# Returns:   non-zero only if installing an available package fails
#######################################
install_if_available() {
  local pkg
  for pkg in "$@"; do
    if apt-cache show "$pkg" >/dev/null 2>&1; then
      sudo apt install -y "$pkg"
    else
      echo "跳过不可用的软件包: $pkg" >&2
    fi
  done
}

# Driver installation only happens when a device was found AND nvidia-smi is
# not available yet.
if [[ -z "$NVIDIA_PRESENT" ]]; then
  # No NVIDIA device: nothing is installed in this branch, so say so.
  echo "未检测到 NVIDIA 设备,跳过驱动安装."
elif command -v nvidia-smi &>/dev/null; then
  echo "CUDA drivers 已经安装 nvidia-smi 工作正常."
else
  case "${DISTRO:-}" in
    debian)
      case "${VERSION:-}" in
        10 | 11)
          # Debian 10 and 11 are supported.
          kernel_release="$(uname -r)"
          sudo -- sh -c 'apt update; apt upgrade -y; apt autoremove -y; apt autoclean -y'
          sudo apt install -y "linux-headers-${kernel_release}"
          # These three lines had no space before "-y", so apt was asked for
          # packages literally named "...-y" and "-y" itself was never passed.
          # NOTE(review): linux-hwe / linux-modules-* look like Ubuntu package
          # names - confirm they exist on Debian. They are installed only when
          # the configured repositories actually provide them.
          install_if_available \
            linux-hwe \
            "linux-modules-${kernel_release}" \
            "linux-modules-extra-${kernel_release}"
          # Reinstall the network card driver.
          sudo apt-get install r8168-dkms -y
          sudo apt update -y
          # -y added: without it apt stops and waits for a confirmation.
          sudo apt install -y nvidia-driver firmware-misc-nonfree
          # NOTE(review): the keyring package is downloaded into the current
          # directory but nothing below installs it (no `dpkg -i`) - confirm
          # whether the NVIDIA CUDA repository is meant to be enabled.
          wget "https://developer.download.nvidia.com/compute/cuda/repos/debian${VERSION}/x86_64/cuda-keyring_1.1-1_all.deb"
          sudo apt install -y nvidia-cuda-dev nvidia-cuda-toolkit
          sudo apt update -y
          ;;
        *)
          echo "debian 版本不被支持"
          exit 1
          ;;
      esac
      ;;
    *)
      echo "只搞 debian 不干其它,换个系统."
      exit 1
      ;;
  esac
  echo " 5 秒后,即将 reboot !!! 重启后,再干一遍这个脚本,确保驱动完整 !"
  sleep 5s
  sudo reboot
  # Stop here: the user is told to run the script again after the reboot, so
  # the remaining steps must not start while the machine is going down.
  exit 0
fi
# Driver check: run nvidia-smi, but only when the earlier PCI scan recorded an
# NVIDIA device in NVIDIA_PRESENT.
if [[ -n "${NVIDIA_PRESENT}" ]]; then
  nvidia-smi
fi
# Install Docker CE from Docker's apt repository unless a `docker` command is
# already on PATH.
if command -v docker &>/dev/null; then
  echo "Docker 已经安装."
else
  echo "Docker 没有安装..."
  # Install the Docker CE keyring.
  sudo apt update -y
  sudo apt install -y ca-certificates curl gnupg
  sudo install -m 0755 -d /etc/apt/keyrings
  FILE=/etc/apt/keyrings/docker.gpg
  # Drop a key left by an earlier run before writing a fresh one (presumably
  # so gpg does not stop to ask about overwriting the output file).
  if [[ -f "$FILE" ]]; then
    sudo rm "$FILE"
  fi
  curl -fsSL https://download.docker.com/linux/debian/gpg | sudo gpg --dearmor -o "$FILE"
  sudo chmod a+r "$FILE"
  # Add the Docker CE repository to the apt sources and install.
  # The entry must name the key with signed-by=: a key stored under
  # /etc/apt/keyrings is only used for repositories that reference it, so
  # without the option apt cannot verify this repository.
  echo \
    "deb [arch=$(dpkg --print-architecture) signed-by=${FILE}] https://download.docker.com/linux/debian \
    $(. /etc/os-release; echo "$VERSION_CODENAME") stable" |
    sudo tee /etc/apt/sources.list.d/docker.list >/dev/null
  sudo apt update -y
  sudo apt -y install docker-ce
fi
# Make a standalone `docker-compose` command available: if none is on PATH,
# install the compose plugin and link its binary into /usr/bin.
if command -v docker-compose &>/dev/null; then
  echo "Docker-compose 存在."
else
  echo "Docker-compose 不存在. 干它一波..."
  # Install the docker-compose subcommand.
  sudo apt -y install docker-compose-plugin
  # -f replaces a link left behind by an earlier run. A dangling link makes
  # `command -v` fail, which leads here, where a plain `ln -s` would then
  # refuse with "File exists".
  sudo ln -sfv /usr/libexec/docker/cli-plugins/docker-compose /usr/bin/docker-compose
  docker-compose --version
fi
# Test GPU access from a container; install nvidia-docker support if it fails.
# Only runs when the PCI scan found an NVIDIA device.
if [[ -n "$NVIDIA_PRESENT" ]]; then
  if sudo docker run --gpus all nvidia/cuda:11.0.3-base-ubuntu18.04 nvidia-smi &>/dev/null; then
    echo "nvidia-docker 开启且已经正常工作"
  else
    echo "nvidia-docker没工作. 准备安装..."
    # Repository path component, built from os-release as ID + VERSION_ID.
    distribution="$(. /etc/os-release; echo "${ID}${VERSION_ID}")"
    # `apt-key add` needs "-" to read the key from the pipe; without it the
    # key was never imported.
    # -f makes curl fail on an HTTP error instead of passing the error page on.
    # NOTE(review): apt-key is deprecated in newer apt releases - consider a
    # signed-by keyring when newer Debian versions are added.
    curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
    curl -fsSL "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" |
      sudo tee /etc/apt/sources.list.d/nvidia-docker.list
    sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
    sudo systemctl restart docker
    # Run the same container test again, this time with its output visible.
    sudo docker run --gpus all nvidia/cuda:11.0.3-base-ubuntu18.04 nvidia-smi
  fi
fi
# Put the NVIDIA packages on hold. The patterns are quoted so that apt-mark
# receives them as written; unquoted, the shell would first try to expand them
# against file names in the current directory.
# Best-effort, as before: a failure to hold must not stop the remaining setup.
sudo apt-mark hold 'nvidia*' 'libnvidia*' || true
# Add the docker group and put the current user into it.
getent group docker >/dev/null || sudo groupadd docker || true
sudo usermod -aG docker "${USER:-$(id -un)}" || true
# `newgrp docker` is intentionally not run here: it starts a new shell, which
# would stall this script (and the steps below) until that shell exits.
echo "当前用户已加入 docker 组; 重新登录或手动执行 'newgrp docker' 后生效."
# Workaround for an NVIDIA Docker problem.
echo "正在应用解决方案,针对 NVIDIA Docker 的问题,参考 https://github.com/NVIDIA/nvidia-docker/issues/1730"
# Summary of the problem and the workaround:
#   The problem shows up when the host runs daemon-reload, which can make
#   containers that use systemd lose access to the NVIDIA GPU.
#   To check whether a host is affected, run `sudo systemctl daemon-reload` on
#   the host and then check GPU access with `nvidia-smi` inside a container.
#   If it is affected, apply the workaround below.
# Workaround steps:
#   Disable cgroups for Docker containers so the problem cannot occur.
#   Edit the Docker daemon configuration.
# The quoted delimiter keeps the JSON literal; tee runs as root to write the
# file and its echo of the content is discarded.
sudo tee /etc/docker/daemon.json >/dev/null <<'EOF'
{
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
},
"exec-opts": ["native.cgroupdriver=cgroupfs"]
}
EOF
# Restart Docker so the new configuration takes effect.
sudo systemctl restart docker
echo "解决方案已应用。Docker 已配置为使用 'cgroupfs' 作为 cgroup 驱动。"
# Page:
# [1]