常用命令行及详解

Conda 新建环境

conda create --name resnet python=3.8

# pip 换清华源
python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple 
# 用清华源安装包：
pip install pybind11 -i https://pypi.tuna.tsinghua.edu.cn/simple

conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/

conda config --set show_channel_urls yes

# 删除conda环境
conda remove --name <env_name> --all -y

fast_proxy=100.68.166.234:3128
git config --global http.proxy http://100.68.166.234:3128
git config --global https.proxy https://100.68.166.234:3128
export http_proxy=${fast_proxy}
export https_proxy=${fast_proxy}

Linux

将Windows换行符（CRLF）转换为Linux换行符（LF）：sed -i 's/\r$//' run_week_resnet.sh

测试基本语句

查看显存占用：watch nvidia-smi
输出当前的commit ：git rev-parse HEAD
查看当前是否有python进程：

ps -ef | grep python  或者  watch pgrep -af python

watch 'pgrep -f python | xargs -r ps -o pid,user,%cpu,%mem,cmd | grep -v grep | grep -v watch'

python -m nvitop

打印CPU信息：lscpu
查看IB版本：ofed_info -s
加权限：chmod +x /data/sunjinfeng/test_data/write_data.sh
查看当前目录下所有文件和子目录大小：du -sh *
统计当前目录下名称包含 conv2d.weight 的条目数量：ls -la | grep conv2d.weight | wc -l
编辑定时任务：crontab -e

# 用指定的 GPU 进行训练
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

Tmux

# 查看所有tmux会话
指  令：tmux ls
快捷键：Ctrl+b s

# 新建tmux会话
指  令：tmux new -s <session-name>

# 重命名会话
指  令：tmux rename-session -t <old-name> <new-name>
快捷键：Ctrl+b $

# 分离会话
指  令：tmux detach  或者使用  exit(关闭窗口)
快捷键：Ctrl+b d

# 重新连接会话
指  令：tmux attach -t <session-name>  或者使用 tmux at -t <session-name>

# 最大化当前窗格（个人很喜欢的快捷键，注意：最大化的是当前选中的窗格）
快捷键：Ctrl+b z (再次 Ctrl+b z 则恢复)

# 杀死会话
指  令：tmux kill-session -t <session-name>

# 切换会话
指  令：tmux switch -t <session-name>

# 划分上下两个窗格
指  令：tmux split
快捷键：Ctrl+b "

# 划分左右两个窗格
指  令：tmux split -h
快捷键：Ctrl+b %

# 光标切换到上方窗格
指  令：tmux select-pane -U
快捷键：Ctrl+b 方向键上

# 光标切换到下方窗格
指  令：tmux select-pane -D
快捷键：Ctrl+b 方向键下

# 光标切换到左边窗格
指  令：tmux select-pane -L
快捷键：Ctrl+b 方向键左

# 光标切换到右边窗格
指  令：tmux select-pane -R
快捷键：Ctrl+b 方向键右

# 翻页
快捷键：Ctrl+b [

Git

输出当前的commit ：git rev-parse HEAD

git log

提交和修改

git add	添加文件到暂存区
git status	查看仓库当前的状态，显示有变更的文件。
git diff	比较文件的不同，即暂存区和工作区的差异。
git commit	提交暂存区到本地仓库。
git reset	回退版本。
git rm	将文件从暂存区和工作区中删除。
git mv	移动或重命名工作区文件。

远程操作

命令	说明
git remote	远程仓库操作
git fetch	从远程获取代码库
git pull	下载远程代码并合并
git push	上传本地代码到远程仓库并合并

git 提交流程

git status

git add design/design_of_autotest.md

git commit -m test 

git push origin add_autotest:revert-1988-add_autotest

git 设置和取消代理

# 设置代理
git config --global http.proxy http://127.0.0.1:7890

git config --global https.proxy https://127.0.0.1:7890

# 取消代理
git config --global --unset http.proxy

git config --global --unset https.proxy

HuggingFace

# huggingface 的库 clone 下来不会包含 LFS 大文件，模型权重一般需要单独下载

git lfs pull --include="*.bin" 是一个 Git LFS（Large File Storage）命令，用于从远程仓库拉取特定类型的大型文件（`*.bin` 文件）。以下是命令的解释：

- git lfs pull：使用 Git LFS 拉取大型文件。
- --include="*.bin"：指定只拉取符合文件名模式 *.bin 的文件。这里的通配符 * 表示匹配任意字符，而 .bin 则表示匹配以 .bin 结尾的文件名。

使用该命令会触发 Git LFS 的功能，从远程仓库下载所有符合 *.bin 文件名模式的文件，并将它们存储在本地的 Git LFS 存储区中。

Git LFS 是一个用于管理大型文件的扩展，它可以将大型文件存储在与 Git 仓库分离的存储区，以提高 Git 仓库的性能和效率。通过使用 `git lfs pull` 命令并指定 `--include` 参数，可以选择性地拉取特定类型的大型文件。

OneFlow&Libai

安装 Oneflow&libai

python3 -m pip install --pre oneflow -f https://staging.oneflow.info/branch/master/cu117

# 安装特定分支
python3 -m pip install --pre oneflow -f https://staging.oneflow.info/canary/refs/heads/plan_sep_compile_merge/cu117/index.html

# 安装特定commit
python3 -m pip install --pre oneflow -f https://staging.oneflow.info/canary/commit/{commit}/cu117

#验证编译成功 
python3 -m oneflow --doctor
----------------------------------------------------------------------------------------
cd libai
python3 -m pip install -r requirements.txt
python3 -m pip install -e . --user

卸载 Oneflow&libai

python3 -m pip uninstall libai
python3 -m pip uninstall oneflow -y

Docker

基本操作

# 停止、启动和删除容器
docker stop id
docker start id
docker rm id
# 退出并关闭容器
exit
# 退出但保持容器运行
Ctrl + p，然后 Ctrl + q
# 进入正在运行的docker容器
docker attach id 
# commit 容器，保存成新的镜像
docker commit -a "oneflow.org" -m "ib ssh" 6b2270aedfc1 megatron:1.1-py3
# 将镜像保存为 tar 包
docker save -o /data/home/sunjinfeng/megatron/megatron_1.0-py3.tar megatron:1.0-py3
# 加载保存好的镜像
docker load -i /data_turbo/docker_images/ngc_ssh_21.07_py39.tar
# 查看所有容器
docker ps -a
# 查看所有镜像
docker images

Docker run 各参数解释

docker run --gpus all -itd --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --cap-add=IPC_LOCK --name "NCCL_test_sjf" --ipc host --net host -v "/data_32T/home/sunjinfeng/workspace":"/data_32T/home/sunjinfeng/workspace" "ngc/pytorch-21.07:ssh-ib5.3-config-py38"

bash -c "sed -i 's/Port 62620/Port 10035/g' /root/.ssh/config && /usr/sbin/sshd -p 10035 && bash"

这是一个Docker命令，用于启动一个名为"NCCL_test_sjf"的容器，该容器基于"ngc/pytorch-21.07:ssh-ib5.3-config-py38"镜像，并在容器内部执行一个bash shell脚本。

以下是命令中各个参数的含义：

--gpus all：启用所有可用的GPU设备。
-itd：以交互模式运行容器，并将其作为守护进程（即在后台运行）。
--shm-size=16g：设置共享内存的大小为16GB。
--ulimit memlock=-1：设置memlock的软限制和硬限制为无限制，以允许容器锁定任意数量的内存。
--ulimit core=0：禁用core文件的生成。
--ulimit stack=67108864：设置容器中单个线程的堆栈大小限制为64 MB。
--privileged：启用容器的特权模式，以允许容器内部执行一些需要特权访问的操作。
--cap-add=IPC_LOCK：为容器添加IPC_LOCK能力（capability），以允许容器内的进程锁定内存（如 mlock、mlockall、shmctl 的 SHM_LOCK），防止其被换出。
--name "NCCL_test_sjf"：将容器命名为"NCCL_test_sjf"。
--ipc host：使用主机的IPC命名空间。
--net host：使用主机的网络命名空间。
-v "/data_32T/home/sunjinfeng/workspace":"/data_32T/home/sunjinfeng/workspace"：将主机的"/data_32T/home/sunjinfeng/workspace"目录挂载到容器的"/data_32T/home/sunjinfeng/workspace"目录中。
bash -c "sed -i 's/Port 62620/Port 10035/g' /root/.ssh/config && /usr/sbin/sshd -p 10035 && bash"：在容器内部执行的命令。该命令首先使用sed命令将"/root/.ssh/config"文件中的"Port 62620"替换为"Port 10035"，然后启动sshd服务，并打开一个新的bash shell。

NCCL-Test

编译NCCL

if [ ! -d "./nccl-tests" ]; then
  git clone https://github.com/NVIDIA/nccl-tests.git
  cd nccl-tests/
  # MPI_HOME 需指向 MPI 的安装根目录（其下包含 include/ 和 lib/），而不是 mpirun 可执行文件；ngc 容器里可通过 whereis mpirun 查看安装位置
  make MPI=1 MPI_HOME=/usr/local/mpi
fi

NCCL 指令参数详解

 `mpirun`: MPI启动命令
 `--allow-run-as-root`: 允许以root用户运行
 `--mca pml ob1`: 使用OpenMPI内置的ob1通信协议
 `--mca btl ^openib,smcuda`: 禁用openib和smcuda通信协议
 `--mca btl_tcp_if_include eth0`: 限制使用eth0网卡进行TCP通信
 `--mca oob_tcp_if_include eth0`: 限制使用eth0网卡进行MPI进程间的控制消息传输
 `-x NCCL_DEBUG=WARN`: 设置NCCL库的调试级别为WARN
 `-x NCCL_IB_HCA`: 指定使用的IB网卡的名称
 `-x LD_LIBRARY_PATH`: 设置动态链接库路径
 `-x PATH`: 设置PATH环境变量
 `-x NCCL_IB_DISABLE=0`: 启用IB网卡
 `-x NCCL_IB_GID_INDEX=3`: 指定IB网卡的GID Index
 `-x NCCL_IB_PCI_RELAXED_ORDERING=1`: 启用PCI relaxed ordering
 `-x NCCL_SOCKET_IFNAME=eth0`: 指定用于NCCL socket通信的网卡
 `-x NCCL_ALGO=RING`: 指定NCCL算法为RING
 `-np 16`: 指定总进程数为16个
 `-N 8`: 每个节点的进程数为8个
 `-hostfile /root/mpi_hostfile`: 指定节点的列表文件
 `/share_nfs/nccl_test/nccl-tests/build/all_reduce_perf`: 指定要运行的NCCL性能测试二进制文件
 `-b 1024M`: 指定测试的起始（最小）数据大小为1024MB
 `-e 1024M`: 指定测试的结束（最大）数据大小为1024MB
 `-n 100`: 指定测试的循环次数为100
 `-g 1`: 指定每个线程使用的GPU数量为1个

NCCL 单机 1n8g

./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8

百度测试

mpirun -np 8 -H 127.0.0.1:8 --allow-run-as-root -bind-to none -map-by slot -x NCCL_IB_DISABLE=0 -x NCCL_IB_GID_INDEX=3 -x NCCL_GDR_LEVEL=2 -x NCCL_DEBUG=INFO -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_TC=160 -x NCCL_TOPO_FILE=/root/workspace/nccl-tests/nccl_topo_a800_1.6t.xml -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl_tcp_if_include bond0 -mca btl ^openib /data/sunjinfeng/nccl-tests/build/all_reduce_perf -b 1M -e 1024M -f 2 -g 1 | tee /data/sunjinfeng/nccl-tests/nccl_log/nccl_increace_1n8g_5.25.log

mpirun -np 16 -H 003:8,002:8 --allow-run-as-root -bind-to none -map-by slot -x NCCL_IB_DISABLE=0 -x NCCL_IB_GID_INDEX=3 -x NCCL_GDR_LEVEL=2 -x NCCL_DEBUG=INFO -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_TC=160  -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl_tcp_if_include eth0 -mca btl ^openib /data/sunjinfeng/nccl-tests/build/all_reduce_perf -b 1M -e 1024M -f 2 -g 1 | tee /data/sunjinfeng/nccl-tests/nccl_log/nccl_increace_2n8g_5.25.log

NCCL 多机 8n8g 规定大小

wget https://taco-1251783334.cos.ap-shanghai.myqcloud.com/nccl/nccl_topo_a800_1.6t.xml
# 拓扑文件，在TCCL下不需要这个
-x NCCL_TOPO_FILE=/data_32T/home/sunjinfeng/workspace/nccl_topo_a800_1.6t.xml

mpirun -np 64 -H 030:8,031:8,033:8,038:8,035:8,032:8,036:8,037:8 --allow-run-as-root -bind-to none -map-by slot -x NCCL_IB_DISABLE=0 -x NCCL_IB_GID_INDEX=3 -x NCCL_GDR_LEVEL=2 -x NCCL_DEBUG=INFO -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_TC=160 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl_tcp_if_include bond0 -mca btl ^openib /data_turbo/home/sunjinfeng/workspace/nccl-tests/build/all_reduce_perf -b 2G -e 4G -f 2 -g 1 | tee /data_turbo/home/sunjinfeng/workspace/nccl-tests/nccl_log/nccl_increace_8n8g_5.19.log

NCCL 多机 2n8g 增长步长为2

mpirun -np 16 -H 030:8,031:8 --allow-run-as-root -bind-to none -map-by slot -x NCCL_IB_DISABLE=0 -x NCCL_IB_GID_INDEX=3 -x NCCL_GDR_LEVEL=2 -x NCCL_DEBUG=INFO  -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_TC=160 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl_tcp_if_include bond0 -mca btl ^openib /data_turbo/home/sunjinfeng/workspace/nccl-tests/build/all_reduce_perf -b 2G -e 4G -f 2 -g 1 | tee /data_turbo/home/sunjinfeng/workspace/nccl-tests/nccl_log/nccl_increce.log

火山云测试

设置 github 的节点

fast_proxy=100.68.166.234:3128
git config --global http.proxy http://${fast_proxy}
git config --global https.proxy https://${fast_proxy}

运行 libai

bash tools/args_train.sh configs/gpt2_pretrain.py 1 1 true true true 2 4 false 2 220 100 48 144 2304 9216

运行 Megatron

bash examples/megatron_args_pretrain_gpt2.sh 1 1 true true true 2 4 false 2 220 100 48 144 2304 9216

腾讯云测试

home 目录：/data_32T/home/sunjinfeng/workspace
new_home 目录：/data_turbo/home/sunjinfeng/workspace
通过 xftp 往 Windows 传输文件

cp -rf nccl_increace_8n8g.log /data_32T/home/sunjinfeng/workspace/share/0519/

运行 2卡 libai

bash tools/args_train.sh configs/gpt2_pretrain.py 2 8 0 10.0.0.26 1 1 true true true 2 1 false 2 220 100 48 144 2304 9216
bash tools/args_train.sh configs/gpt2_pretrain.py 2 8 1 10.0.0.26 1 1 true true true 2 1 false 2 220 100 48 144 2304 9216
--------------------------------------------------------------------------------
bash tools/args_train.sh configs/gpt2_pretrain.py 2 8 0 10.0.0.39 1 1 true true true 2 128 false 2 220 100 48 144 2304 9216
bash tools/args_train.sh configs/gpt2_pretrain.py 2 8 1 10.0.0.39 1 1 true true true 2 128 false 2 220 100 48 144 2304 9216

运行多卡 Libai

bash libai_gpt.sh 058,059,060,061 10.0.0.53 4
------------------------------------------------------------------------
bash libai_gpt.sh 030,031,032,033,038,035,036,037 10.0.0.23 1
-------------------------------------------------------------------------------
bash run_train_libai.sh 051,052,053,054,055,056,057,058,059,060,061,062,063,026,027,028 10.0.0.114 1

bash run_libai_gpt.sh 030,031,032,033,038,035,036,037,038,039,041,042,023,044,045,046,047,048,049,050,051,052,053,054,055,056,057,058,059,060,061,062 10.0.0.23 1

# 数据并行和模型并行
bash tools/args_train.sh configs/gpt2_pretrain.py 2 8 0 10.0.0.106 2 1 true true true 32 1 false 2 220 100 64 144 2304 9216
# 三并行都开
bash tools/args_train.sh configs/gpt2_pretrain.py 2 8 0 10.0.0.106 2 2 true true true 64 1 false 2 220 100 80 144 2304 9216

运行 Megatron

export NCCL_IB_DISABLE=0
export NCCL_DEBUG=INFO
export NCCL_IB_GID_INDEX=3
export NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8
export NCCL_SOCKET_IFNAME=eth0
bash examples/megatron_args_pretrain_gpt2.sh 1 8 0 127.0.0.1 8 1 true true true 8 1 false 0 220 100 24 16 1024 4096

bash examples/megatron_args_pretrain_gpt2.sh 8 8 0 10.0.0.23 2 1 true true true 16 1 false 2 220 100 48 144 2304 9216
bash examples/megatron_args_pretrain_gpt2.sh 2 8 1 10.0.0.23 1 1 true true true 2 1 false 2 20 1 48 144 2304 9216
-------------------------------------------------------------------------------
bash examples/megatron_args_pretrain_gpt2.sh 2 8 0 10.0.0.39 1 1 true true true 2 128 false 2 220 100 48 144 2304 9216
bash examples/megatron_args_pretrain_gpt2.sh 2 8 1 10.0.0.39 1 1 true true true 2 128 false 2 220 100 48 144 2304 9216

运行多卡 Megatron

bash run_train_megatron.sh 030,031,032,033,034,035,036,037 10.0.0.23 1
-------------------------------------------------------------------------------
bash run_train_megatron.sh 051,052,053,054,055,056,057,058,059,060,061,062,063,026,027,028 10.0.0.114 1

# 数据并行和模型并行
bash examples/megatron_args_pretrain_gpt2.sh 8 8 0 10.0.0.23 2 1 true true true 32 1 false 2 220 100 64 144 2304 9216
# 三并行都开
bash examples/megatron_args_pretrain_gpt2.sh 2 8 0 10.0.0.106 2 2 true true true 64 1 false 2 220 100 80 144 2304 9216