docker pull nvcr.io/nvidia/pytorch:24.03-py3
cd /mnt
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | tee /etc/apt/sources.list.d/nvidia-docker.list
apt-get update
apt-get install -y nvidia-docker2
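After nvidia-docker2 is installed, the Docker daemon usually has to be restarted before the NVIDIA container runtime is picked up; a minimal step, assuming the host uses systemd:

# Reload the Docker daemon so it registers the NVIDIA container runtime
systemctl restart docker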
nvidia-docker run -ti -e NVIDIA_VISIBLE_DEVICES=all --privileged \
--net=host -v $PWD:/home \
-w /home --rm nvcr.io/nvidia/pytorch:24.03-py3 /bin/bash
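Once inside the container, it is worth a quick sanity check that all GPUs are visible before installing anything else (the NGC PyTorch image ships with nvidia-smi):

# Should list every GPU exposed via NVIDIA_VISIBLE_DEVICES=all
nvidia-smi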
pip install transformers
pip install deepspeed
git clone https://github.com/microsoft/Megatron-DeepSpeed
cd Megatron-DeepSpeed
git checkout 3c5f47563f697702c1e305fa01b7563f54b747fc
python3 setup.py install
apt update
apt install -y openssh-server
apt install -y openmpi-bin openmpi-doc libopenmpi-dev
rm -rf ~/.ssh/*
ssh-keygen
sed -i 's/^.*PermitRootLogin.*$/PermitRootLogin yes/g' /etc/ssh/sshd_config
sed -i 's/^.*Port.*$/Port 2223/g' /etc/ssh/sshd_config
export passwd=Hello123 && printf "${passwd}\n${passwd}\n" | passwd root
cat >/usr/bin/run.sh <<EOF
#!/bin/bash
mkdir -p /run/sshd
source ~/.bashrc
/usr/sbin/sshd -D
EOF
chmod 777 /usr/bin/run.sh
nohup /usr/bin/run.sh &
tee ~/.ssh/config <<-'EOF'
Host worker_1
    User root
    Hostname 192.168.1.100
    Port 2223
    IdentityFile ~/.ssh/id_rsa
Host worker_2
    User root
    Hostname 192.168.1.101
    Port 2223
    IdentityFile ~/.ssh/id_rsa
EOF
ssh-copy-id worker_1
ssh-copy-id worker_2
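Passwordless SSH is what pdsh and the DeepSpeed launcher rely on, so it is worth confirming the keys were copied correctly; both commands should print the remote hostname without asking for the Hello123 password:

ssh worker_1 hostname
ssh worker_2 hostname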
wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/pdsh/pdsh-2.29.tar.bz2
tar -xf pdsh-2.29.tar.bz2
cd pdsh-2.29
./configure --with-ssh
make -j
make install
cp /usr/local/bin/pdsh /usr/bin/
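DeepSpeed's default multi-node launcher drives the worker nodes through pdsh over SSH, so a quick check that pdsh can reach both hosts defined in ~/.ssh/config:

# Force the ssh rcmd module and fan the command out to both workers
export PDSH_RCMD_TYPE=ssh
pdsh -w worker_1,worker_2 hostname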
pip install --upgrade protobuf==3.20.1
cd /home/Megatron-DeepSpeed
wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
xz -d oscar-1GB.jsonl.xz
python3 tools/preprocess_data.py \
--input oscar-1GB.jsonl \
--output-prefix my-gpt2 \
--vocab-file gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod \
--workers 8
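With --dataset-impl mmap and --output-prefix my-gpt2, the preprocessing script writes an indexed binary dataset as a .bin/.idx pair; the shared basename (without extension) is what --data-path points to in the training command below:

# Confirm the indexed dataset was produced
ls -lh my-gpt2_text_document.bin my-gpt2_text_document.idx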
cd /home/Megatron-DeepSpeed
tee hostfile <<-'EOF'
worker_1 slots=1
worker_2 slots=1
EOF
tee ds_config.json <<-'EOF'
{
  "train_micro_batch_size_per_gpu": 1,
  "train_batch_size": 16,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 1
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOF
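For reference, the batch-size arithmetic implied by this config and the hostfile above (a sketch, assuming 2 workers with 1 GPU each as declared by slots=1, and ignoring the batch-size rampup during the earliest samples):

# world_size           = 2 nodes x 1 slot                                  = 2 GPUs
# data-parallel degree = 2 / (tensor-parallel 2 x pipeline-parallel 1)     = 1
# grad accumulation    = train_batch_size 16 / (micro_batch 1 x DP 1)      = 16 steps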
export MAX_JOBS=8
export NCCL_DEBUG=info
export NCCL_SOCKET_IFNAME=enp5s0
export NCCL_IB_DISABLE=1
deepspeed --hostfile ./hostfile pretrain_gpt.py \
    --tensor-model-parallel-size 2 \
    --pipeline-model-parallel-size 1 \
    --distributed-backend nccl \
    --num-layers 2 \
    --hidden-size 8 \
    --num-attention-heads 2 \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --micro-batch-size 1 \
    --rampup-batch-size 2 2 1_000 \
    --global-batch-size 16 \
    --train-samples 10_000 \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 1e-4 \
    --lr-warmup-samples 5 \
    --min-lr 1e-6 \
    --lr-decay-style cosine \
    --lr-decay-samples 12 \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    --fp16 \
    --partition-activations \
    --seed 42 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --exit-interval 100 \
    --log-interval 10 \
    --save-interval 50 \
    --eval-interval 100 \
    --eval-iters 10 \
    --checkpoint-activations \
    --save checkpoints/gpt2_4 \
    --data-path my-gpt2_text_document \
    --tensorboard-dir output_dir/tensorboard \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --deepspeed \
    --deepspeed_config ./ds_config.json \
    --zero-stage 1 \
    --deepspeed-activation-checkpointing
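The --tensorboard-* flags write event files under output_dir/tensorboard; they can be inspected from another shell on the master node (assuming tensorboard is available in the container, otherwise pip install tensorboard first):

tensorboard --logdir output_dir/tensorboard --bind_all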