赞
踩
本文基于 DeepSeek-V2-Chat 多卡推理,演示了几种不同的 Profiling 方法。
备注:
# ---------------------------------------------------------------------------
# Step 1: write prof.py. It loads DeepSeek-V2-Chat across the visible GPUs,
# runs one warm-up generation, then runs the profiling mode named on the
# command line (nsys_prof | torch_prof | torch_tensorboard_trace | cprofile |
# torch_prof_threadpool).
# ---------------------------------------------------------------------------
tee prof.py <<-'EOF'
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import init_empty_weights
import sys
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch.cuda
import multiprocessing as mp
import inspect

model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Per-device memory budget handed to accelerate when it builds the device map.
max_memory = {i: "23GB" for i in range(8)}

# Make the model directory importable before the checkpoint is loaded.
sys.path.insert(0, model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
)
model = model.eval()

# Modules listed here are never split across devices by the device map.
no_split_module_classes = ['DeepseekV2MLP', 'DeepseekV2Attention']
# NOTE(review): the weights are loaded as bfloat16 above while the device map
# is sized with 'float16'; kept exactly as in the original script.
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=no_split_module_classes,
    dtype='float16',
)
model = dispatch_model(model, device_map=device_map)

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

messages = [
    {"role": "user", "content": "Write a piece of quicksort code in C++"}
]
input_tensor = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Warm-up run so one-time initialisation cost stays out of every profile.
outputs = model.generate(input_tensor, max_new_tokens=100)
print("warm up done!")


def nsys_prof():
    """Run one generation inside a cudaProfilerStart/Stop capture range.

    Intended to be launched under ``nsys profile
    --capture-range=cudaProfilerApi`` so that only this generation is
    captured. Prints the end-to-end wall-clock time.
    """
    torch.cuda.cudart().cudaProfilerStart()
    try:
        t0 = time.time()
        outputs = model.generate(input_tensor, max_new_tokens=100)
        t1 = time.time()
        print("e2e:", t1 - t0)
    finally:
        # Always close the capture range, even if generation fails.
        torch.cuda.cudart().cudaProfilerStop()


def torch_prof():
    """Profile one generation with torch.profiler and write torch_prof.json."""
    from torch.profiler import profile
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True,
    ) as prof:
        outputs = model.generate(input_tensor, max_new_tokens=100)
    prof.export_chrome_trace("torch_prof.json")


def torch_tensorboard_trace():
    """Profile one generation and write a TensorBoard trace under ./log."""
    prof = torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=0),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    )
    prof.start()
    for i in range(1):
        outputs = model.generate(input_tensor, max_new_tokens=100)
        torch.cuda.synchronize()
        prof.step()
    prof.stop()


def infer():
    """Run one generation and wait for the GPU; the target of cprofile()."""
    outputs = model.generate(input_tensor, max_new_tokens=100)
    torch.cuda.synchronize()


def cprofile():
    """Profile infer() with cProfile and print the top 50 by cumulative time."""
    import cProfile
    import pstats
    # cProfile.run executes the statement in __main__, where infer() lives.
    cProfile.run('infer()', 'restats')
    p = pstats.Stats('restats')
    p.sort_stats('cumulative').print_stats(50)


def torch_prof_threadpool():
    """Profile four concurrent generations, each on its own CUDA stream.

    Writes the trace to torch_prof_threadpool.json and prints the wall-clock
    time of each worker.
    """
    from torch.profiler import profile
    from concurrent.futures import ThreadPoolExecutor
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True,
    ) as prof:
        # Named differently from the module-level infer() to avoid shadowing.
        def infer_on_stream(index):
            past_key_values = None
            input_tensor = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)
            custom_stream = torch.cuda.Stream()
            t0 = time.time()
            with torch.cuda.stream(custom_stream):
                outputs = model.generate(input_tensor, max_new_tokens=100)
            t1 = time.time()
            print(index, t1 - t0)
            return None

        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(infer_on_stream, range(0, 4)))
    prof.export_chrome_trace("torch_prof_threadpool.json")


# Explicit dispatch table instead of eval(sys.argv[1]): only the profiling
# modes defined above can be selected from the command line.
PROFILERS = {
    "nsys_prof": nsys_prof,
    "torch_prof": torch_prof,
    "torch_tensorboard_trace": torch_tensorboard_trace,
    "cprofile": cprofile,
    "torch_prof_threadpool": torch_prof_threadpool,
}

if len(sys.argv) < 2 or sys.argv[1] not in PROFILERS:
    sys.exit("usage: python prof.py <" + "|".join(PROFILERS) + ">")
PROFILERS[sys.argv[1]]()
EOF

# ---------------------------------------------------------------------------
# Step 2: run each profiling mode.
# ---------------------------------------------------------------------------

# Nsight Systems: capture only the region between cudaProfilerStart/Stop.
nsys profile --stats=true -o cuda_profing_report.nsys-rep -f true -t cuda,nvtx \
    --gpu-metrics-device=0,1,2,3,4,5,6,7 \
    --capture-range=cudaProfilerApi \
    --capture-range-end=stop python prof.py nsys_prof

# torch.profiler -> Chrome trace (torch_prof.json).
python prof.py torch_prof

# torch.profiler -> TensorBoard trace under ./log.
mkdir -p log
python prof.py torch_tensorboard_trace

# cProfile -> top 50 functions by cumulative time.
python prof.py cprofile

# torch.profiler with four concurrent generations (torch_prof_threadpool.json).
python prof.py torch_prof_threadpool

# ---------------------------------------------------------------------------
# Step 3: keep only the "traceEvents" section of each Chrome trace.
# ---------------------------------------------------------------------------
tee get_traceEvents.py <<-'EOF'
import json
import sys

# Usage: python get_traceEvents.py <trace.json>
# Writes traceEvents_<trace.json> containing only the "traceEvents" key.
filepath = sys.argv[1]
with open(filepath, "r") as src:
    data = json.load(src)
with open(f"traceEvents_{filepath}", "w") as dst:
    json.dump({"traceEvents": data['traceEvents']}, dst)
EOF

python get_traceEvents.py torch_prof_threadpool.json
python get_traceEvents.py torch_prof.json
rm -f torch_prof.json torch_prof_threadpool.json
输出
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- -------- -------- -------- -------- ----------- --------------------------------------------- 66.5 1387919547 200835 6910.7 6470.0 4070 518456 3693.0 cudaLaunchKernel 18.8 391445205 20190 19388.1 17149.5 5360 793835 22195.6 cudaMemcpyAsync 5.6 115988383 19115 6067.9 5810.0 3530 188359 3372.6 cudaMemsetAsync 2.9 60523944 23956 2526.5 1970.0 1240 205099 2145.7 cudaStreamWaitEvent 2.1 43275528 1904 22728.7 3120.0 2630 1948957 88939.7 cudaStreamSynchronize 1.7 36032225 23956 1504.1 1260.0 930 477756 3194.3 cudaEventRecord 1.4 28312023 23956 1181.8 950.0 700 481776 3321.9 cudaEventCreateWithFlags 1.1 22670547 23956 946.3 730.0 570 465647 7535.8 cudaEventDestroy 0.0 170028 100 1700.3 1640.0 1520 2060 140.6 cudaStreamIsCapturing_v10000 0.0 33330 16 2083.1 1715.0 1390 4950 894.8 cudaOccupancyMaxActiveBlocksPerMultiprocessor 0.0 7310 1 7310.0 7310.0 7310 7310 0.0 cuProfilerStart [5/7] Executing 'cuda_gpu_kern_sum' stats report Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name -------- --------------- --------- --------- --------- -------- -------- ----------- ---------------------------------------------------------------------------------------------------- 24.6 488154814 18117 26944.6 26720.0 25920 31935 892.4 ampere_bf16_s16816gemm_bf16_64x64_ldg8_f2f_stages_64x6_tn 15.7 312293607 11187 27915.8 21408.0 21152 166178 15652.5 void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x2_tn_align8>(T1::Para… 15.2 302994818 2970 102018.5 101168.5 9376 209283 92340.2 std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n… 9.8 195548625 3150 62078.9 41089.0 4064 91073 27330.1 void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x1_tn_align8>(T1::Para… 8.7 172566020 1485 116206.1 40000.0 38976 1185823 285264.4 void 
gemv2T_kernel_val<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (int)128, (int… [6/7] Executing 'cuda_gpu_mem_time_sum' stats report Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation -------- --------------- ----- -------- -------- -------- -------- ----------- ------------------ 53.3 847221455 13679 61935.9 1600.0 1120 1117006 155051.3 [CUDA memcpy DtoH] 45.6 724555193 12181 59482.4 2528.0 448 1027370 143492.5 [CUDA memcpy HtoD] 0.6 10113989 19115 529.1 480.0 447 1568 172.0 [CUDA memset] 0.5 8129349 6308 1288.7 1280.0 1185 1728 90.1 [CUDA memcpy DtoD] [7/7] Executing 'cuda_gpu_mem_size_sum' stats report Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation ---------- ----- -------- -------- -------- -------- ----------- ------------------ 6213.111 13679 0.454 0.010 0.000 6.963 1.129 [CUDA memcpy DtoH] 6211.319 12181 0.510 0.010 0.000 6.963 1.184 [CUDA memcpy HtoD] 134.408 6308 0.021 0.000 0.000 0.410 0.088 [CUDA memcpy DtoD] 16.679 19115 0.001 0.000 0.000 0.035 0.005 [CUDA memset] Generated: /home/autotrain/torch_prof/cuda_profing_report.nsys-rep /home/autotrain/torch_prof/cuda_profing_report.sqlite warm up done! 
e2e: 9.11670708656311 ---------------------------------------------------------------------------------------------------------------- 3206138 function calls (2757755 primitive calls) in 6.868 seconds Ordered by: cumulative time List reduced from 330 to 50 due to restriction <50> ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 6.868 6.868 {built-in method builtins.exec} 1 0.000 0.000 6.868 6.868 <string>:1(<module>) 1 0.000 0.000 6.868 6.868 /home/autotrain/torch_prof/prof.py:75(infer) 1401/1 0.019 0.000 6.868 6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/utils/_contextlib.py:112(decorate_context) 1 0.000 0.000 6.868 6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:1440(generate) 1 0.017 0.017 6.866 6.866 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:2310(_sample) 70800/100 0.064 0.000 6.661 0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1528(_wrapped_call_impl) 70800/100 0.155 0.000 6.661 0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1534(_call_impl) 58500/100 0.182 0.000 6.660 0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:160(new_forward) 100 0.001 0.000 6.613 0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1611(forward) 100 0.007 0.000 6.594 0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1425(forward) 1500 0.059 0.000 6.419 0.004 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1209(forward) 1400 0.035 0.000 3.599 0.003 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:564(forward) 1400 0.242 0.000 2.756 0.002 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:586(moe_infer) 1500 0.253 0.000 2.093 0.001 
/root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:799(forward) 9900 0.229 0.000 1.973 0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:386(forward) 58500 0.070 0.000 1.388 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:316(pre_forward) 198455/117100 0.479 0.000 1.339 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:135(send_to_device) 37300 0.076 0.000 1.090 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/linear.py:115(forward) 38700 1.015 0.000 1.015 0.000 {built-in method torch._C._nn.linear} 89249 0.789 0.000 0.789 0.000 {method 'to' of 'torch._C.TensorBase' objects} 6100 0.278 0.000 0.687 0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:103(forward) 61684/58699 0.068 0.000 0.657 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:73(honor_type) 127539/118584 0.051 0.000 0.557 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:181(<genexpr>) 1400 0.073 0.000 0.370 0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:418(forward) 1500 0.161 0.000 0.357 0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:337(apply_rotary_pos_emb) 738474/604072 0.117 0.000 0.261 0.000 {built-in method builtins.isinstance} 58600 0.020 0.000 0.203 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:189(<dictcomp>) 1400 0.193 0.000 0.193 0.000 {method 'cpu' of 'torch._C.TensorBase' objects} 2985 0.010 0.000 0.160 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/cache_utils.py:123(update) 67201 0.034 0.000 0.159 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:993(__instancecheck__) 9900 0.008 0.000 0.142 0.000 
/root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/activation.py:395(forward) 9900 0.010 0.000 0.135 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/functional.py:2080(silu) 67201 0.041 0.000 0.125 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:1154(__subclasscheck__) 7570 0.125 0.000 0.125 0.000 {built-in method torch.cat} 265447 0.079 0.000 0.124 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:44(is_torch_tensor) 9900 0.122 0.000 0.122 0.000 {built-in method torch._C._nn.silu} 3000 0.071 0.000 0.119 0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:329(rotate_half) 155900 0.114 0.000 0.114 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1696(__getattr__) 3000 0.097 0.000 0.097 0.000 {built-in method torch.matmul} 6100 0.086 0.000 0.086 0.000 {method 'mean' of 'torch._C.TensorBase' objects} 5900 0.084 0.000 0.084 0.000 {method 'reshape' of 'torch._C.TensorBase' objects} 6100 0.078 0.000 0.078 0.000 {method 'pow' of 'torch._C.TensorBase' objects} 5600 0.072 0.000 0.072 0.000 {method 'type' of 'torch._C.TensorBase' objects} 1400 0.072 0.000 0.072 0.000 {method 'argsort' of 'torch._C.TensorBase' objects} 203 0.070 0.000 0.070 0.000 {built-in method torch.tensor} 67202 0.025 0.000 0.069 0.000 {built-in method builtins.issubclass} 70800 0.060 0.000 0.060 0.000 {built-in method torch._C._get_tracing_state} 200 0.002 0.000 0.058 0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/logits_process.py:72(__call__) 6100 0.058 0.000 0.058 0.000 {built-in method torch.rsqrt} ----------------------------------------------------------------------------------------------------------------
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。