{"id":40335,"date":"2025-05-28T13:43:56","date_gmt":"2025-05-28T05:43:56","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/40335.html"},"modified":"2025-05-28T13:43:56","modified_gmt":"2025-05-28T05:43:56","slug":"%e6%91%a9%e5%b0%94%e7%ba%bf%e7%a8%8bs4000%e5%9b%bd%e4%ba%a7%e4%bf%a1%e5%88%9b%e8%ae%a1%e7%ae%97%e5%8d%a1%e6%80%a7%e8%83%bd%e5%ae%9e%e6%88%98-pytorch%e8%bd%ac%e8%af%91%ef%bc%8c","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/40335.html","title":{"rendered":"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b"},"content":{"rendered":"<h2>\u7b80\u4ecb<\/h2>\n<p>MTT S4000 \u662f\u57fa\u4e8e\u6469\u5c14\u7ebf\u7a0b\u66f2\u9662 GPU \u67b6\u6784\u6253\u9020\u7684\u5168\u529f\u80fd\u5143\u8ba1\u7b97\u5361&#xff0c;\u4e3a\u5343\u4ebf\u89c4\u6a21\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u8fdb\u884c\u4e86\u5b9a\u5236\u4f18\u5316&#xff0c;\u7ed3\u5408\u5148\u8fdb\u7684\u56fe\u5f62\u6e32\u67d3\u80fd\u529b\u3001\u89c6\u9891\u7f16\u89e3\u7801\u80fd\u529b\u548c\u8d85\u9ad8\u6e05 8K HDR \u663e\u793a\u80fd\u529b&#xff0c;\u52a9\u529b\u4eba\u5de5\u667a\u80fd\u3001\u56fe\u5f62\u6e32\u67d3\u3001\u591a\u5a92\u4f53\u3001\u79d1\u5b66\u8ba1\u7b97\u4e0e\u7269\u7406\u4eff\u771f\u7b49\u590d\u5408\u5e94\u7528\u573a\u666f\u7684\u8ba1\u7b97\u52a0\u901f\u3002<\/p>\n<p>MTT S4000 \u5168\u9762\u652f\u6301\u5927\u8bed\u8a00\u6a21\u578b\u7684\u9884\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u670d\u52a1&#xff0c;MUSA \u8f6f\u4ef6\u6808\u4e13\u95e8\u9488\u5bf9\u5927\u89c4\u6a21\u96c6\u7fa4\u7684\u5206\u5e03\u5f0f\u8ba1\u7b97\u6027\u80fd\u8fdb\u884c\u4e86\u4f18\u5316&#xff0c;\u9002\u914d\u4e3b\u6d41\u5206\u5e03\u5f0f\u8ba1\u7b97\u52a0\u901f\u6846\u67b6&#xff0c; \u5305\u62ec DeepSpeed&#xff0c; Colossal AI&#xff0c;Megatron 
\u7b49&#xff0c;\u652f\u6301\u5343\u4ebf\u53c2\u6570\u5927\u8bed\u8a00\u6a21\u578b\u7684\u7a33\u5b9a\u9884\u8bad\u7ec3\u3002<\/p>\n<p>\u5b98\u65b9\u53c2\u6570\u5982\u4e0b<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"1298\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054352-6836a29866a81.png\" width=\"1418\" \/><\/p>\n<h2>\u8fd0\u884c\u73af\u5883<\/h2>\n<p>\u672c\u6b21\u8fd0\u884c\u73af\u5883\u4e3aAutoDL\u4e91\u4e2d\u7684\u955c\u50cf\u73af\u5883&#xff0c;\u7cfb\u7edf\u73af\u5883\u5982\u4e0b<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"237\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054352-6836a298ecadc.png\" width=\"719\" \/><\/p>\n<h2>\u5e38\u7528\u547d\u4ee4<\/h2>\n<h3>\u663e\u5361\u8fd0\u884c\u72b6\u6001<\/h3>\n<p>\u8f93\u5165\u5982\u4e0b\u547d\u4ee4<\/p>\n<p>mthreads-gmi<\/p>\n<p>\u5373\u53ef\u67e5\u770b\u5f53\u524d\u663e\u5361\u8fd0\u884c\u72b6\u6001<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"617\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054353-6836a2990e21d.png\" width=\"648\" \/><\/p>\n<h3>\u67e5\u770b\u5f53\u524dGPU\u8be6\u7ec6\u4fe1\u606f<\/h3>\n<p>\u8f93\u5165<\/p>\n<p>musaInfo <\/p>\n<p>\u5373\u53ef<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"897\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054353-6836a2992c4c3.png\" width=\"803\" \/><\/p>\n<h3>\u67e5\u770b\u5f53\u524d\u8fd0\u884c\u73af\u5883\u7248\u672c<\/h3>\n<p>\u8f93\u5165<\/p>\n<p>musa_version_query<\/p>\n<p>\u5373\u53ef\u67e5\u770b\u5f53\u524d\u8fd0\u884c\u73af\u5883\u7248\u672c<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"1492\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054353-6836a2996f1b2.png\" width=\"741\" 
\/><\/p>\n<h2>Pytorch\u90e8\u5206<\/h2>\n<h3>\u8f6c\u8bd1<\/h3>\n<p>\u6839\u636e\u5b98\u7f51\u4ecb\u7ecd&#xff0c;\u5bf9\u4e8epytorch\u4ee3\u7801&#xff0c;\u53ea\u9700\u8981\u6b63\u786eimport torch_musa\u7684\u62d3\u5c55\u63d2\u4ef6&#xff0c;\u5e76\u4e14\u5c06\u4ee3\u7801\u4e2d\u7684\u6240\u6709cuda-&gt;musa&#xff0c;\u5c06\u6240\u6709\u7684nccl-&gt;mccl\u5373\u53ef\u3002<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"295\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054353-6836a299bd2d9.png\" width=\"1254\" \/><\/p>\n<h3>\u5b9e\u6d4b<\/h3>\n<p>\u4f5c\u8005\u4f7f\u7528\u8c46\u5305\u968f\u673a\u751f\u6210\u4e86\u4e00\u4e2a\u6d4b\u8bd5allreduce\u7684pytorch\u4ee3\u7801&#xff0c;\u4ee3\u7801\u5982\u4e0b&#xff0c;\u5728\u7ecf\u8fc7\u4e0a\u8ff0\u8f6c\u8bd1\u540e\u80fd\u6b63\u5e38\u8fd0\u884c<\/p>\n<p>import os<br \/>\nimport time<br \/>\nimport argparse<br \/>\nimport torch<br \/>\nimport torch_musa<br \/>\nimport torch.distributed as dist<br \/>\nfrom torch.nn.parallel import DistributedDataParallel as DDP<\/p>\n<p>def setup(rank, world_size):<br \/>\n    os.environ[&#039;MASTER_ADDR&#039;] &#061; &#039;localhost&#039;<br \/>\n    os.environ[&#039;MASTER_PORT&#039;] &#061; &#039;12355&#039;<\/p>\n<p>    # \u521d\u59cb\u5316MUSA\u5206\u5e03\u5f0f\u73af\u5883<br \/>\n    dist.init_process_group(&#034;mccl&#034;, rank&#061;rank, world_size&#061;world_size)<br \/>\n    torch.musa.set_device(rank)<\/p>\n<p>def cleanup():<br \/>\n    dist.destroy_process_group()<\/p>\n<p>def run_benchmark(rank, world_size, sizes, num_iters&#061;100, warmup&#061;20):<br \/>\n    setup(rank, world_size)<\/p>\n<p>    for size in sizes:<br \/>\n        # \u521b\u5efa\u968f\u673a\u5f20\u91cf&#xff08;\u4f7f\u7528MUSA\u8bbe\u5907&#xff09;<br \/>\n        tensor &#061; torch.rand(size, device&#061;f&#039;musa:{rank}&#039;)<\/p>\n<p>        # \u9884\u70ed<br \/>\n        for _ in range(warmup):<br \/>\n            dist.all_reduce(tensor)<br \/>\n            
torch.musa.synchronize()<\/p>\n<p>        # \u6d4b\u91cf\u65f6\u95f4<br \/>\n        start_time &#061; time.time()<br \/>\n        for _ in range(num_iters):<br \/>\n            dist.all_reduce(tensor)<br \/>\n            torch.musa.synchronize()<br \/>\n        end_time &#061; time.time()<\/p>\n<p>        # \u8ba1\u7b97\u7edf\u8ba1\u4fe1\u606f<br \/>\n        total_time &#061; end_time &#8211; start_time<br \/>\n        avg_time &#061; total_time \/ num_iters<br \/>\n        size_mb &#061; size * 4 \/ (1024 * 1024)  # float32\u662f4\u5b57\u8282<br \/>\n        bandwidth &#061; (size_mb * world_size) \/ avg_time  # MB\/s<\/p>\n<p>        if rank &#061;&#061; 0:<br \/>\n            print(f&#034;\u5f20\u91cf\u5927\u5c0f: {size:,} \u5143\u7d20 ({size_mb:.2f} MB)&#034;)<br \/>\n            print(f&#034;\u5e73\u5747\u8017\u65f6: {avg_time * 1000:.2f} ms&#034;)<br \/>\n            print(f&#034;\u5e26\u5bbd: {bandwidth \/ 1024:.2f} GB\/s&#034;)<br \/>\n            print(&#034;-&#034; * 50)<\/p>\n<p>    cleanup()<\/p>\n<p>def main():<br \/>\n    parser &#061; argparse.ArgumentParser()<br \/>\n    parser.add_argument(&#039;&#8211;sizes&#039;, type&#061;int, nargs&#061;&#039;&#043;&#039;,<br \/>\n                        default&#061;[1000, 10000, 100000, 1000000, 10000000, 100000000],<br \/>\n                        metavar&#061;&#039;N&#039;,<br \/>\n                        help&#061;&#039;\u6d4b\u8bd5\u7684\u5f20\u91cf\u5927\u5c0f\u5217\u8868&#039;)<br \/>\n    parser.add_argument(&#039;&#8211;num-iters&#039;, type&#061;int, default&#061;100,<br \/>\n                        help&#061;&#039;\u6bcf\u4e2a\u5927\u5c0f\u7684\u8fed\u4ee3\u6b21\u6570&#039;)<br \/>\n    parser.add_argument(&#039;&#8211;warmup&#039;, type&#061;int, default&#061;20,<br \/>\n                        help&#061;&#039;\u9884\u70ed\u8fed\u4ee3\u6b21\u6570&#039;)<br \/>\n    args &#061; parser.parse_args()<\/p>\n<p>    world_size &#061; torch.musa.device_count()<br \/>\n    if world_size !&#061; 4:<br 
\/>\n        raise ValueError(&#034;\u6b64\u811a\u672c\u9700\u89814\u4e2aMUSA GPU&#xff0c;\u4f46\u53d1\u73b0 {} \u4e2aGPU&#034;.format(world_size))<\/p>\n<p>    import torch.multiprocessing as mp<br \/>\n    mp.spawn(run_benchmark,<br \/>\n             args&#061;(world_size, args.sizes, args.num_iters, args.warmup),<br \/>\n             nprocs&#061;world_size,<br \/>\n             join&#061;True)<\/p>\n<p>if __name__ &#061;&#061; &#034;__main__&#034;:<br \/>\n    main()<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"1065\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054354-6836a29a09e31.png\" width=\"658\" \/><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"687\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054354-6836a29a4ef13.png\" width=\"2116\" \/><\/p>\n<h2>MUSA\u7f16\u7a0b<\/h2>\n<h3>p2p\u901a\u4fe1\u90e8\u5206<\/h3>\n<h4>\u4ee3\u7801\u53c2\u8003<\/h4>\n<p>\u7b14\u8005\u6309\u7167\u82f1\u4f1f\u8fbecudasamples\u4ed3\u5e93\u4e2d\u7684p2pbandwidthtest \u4ee3\u7801&#xff0c;cuda-samples\/Samples\/5_Domain_Specific\/p2pBandwidthLatencyTest at master \u00b7 NVIDIA\/cuda-samples \u00b7 GitHub<\/p>\n<p>\u5e76\u4e14\u53c2\u8003\u76f8\u5e94\u7684musa event api\u4e0emublasapi<\/p>\n<p>https:\/\/docs.mthreads.com\/musa-sdk\/musa-sdk-doc-online\/api\/mcc_um.zh-CN<\/p>\n<p>\u7f16\u5199\u4e86\u4e00\u4e2a\u9002\u7528\u4e8e\u6469\u5c14\u7ebf\u7a0b\u7684p2p\u901a\u4fe1\u68c0\u6d4b\u9a8c\u8bc1\u7a0b\u5e8f<\/p>\n<h4>\u4ee3\u7801\u90e8\u5206<\/h4>\n<p>#include &lt;cstdio&gt;<br \/>\n#include &lt;vector&gt;<br \/>\n#include &lt;musa_runtime.h&gt;  \/\/ \u5047\u8bbe MUSA \u5934\u6587\u4ef6<\/p>\n<p>using namespace std;<\/p>\n<p>const char *sSampleName &#061; &#034;P2P (Peer-to-Peer) GPU Bandwidth Latency Test&#034;;<\/p>\n<p>typedef enum {<br \/>\n    P2P_WRITE &#061; 0,<br \/>\n    P2P_READ  &#061; 1,<br \/>\n} P2PDataTransfer;<\/p>\n<p>typedef enum {<br \/>\n    CE &#061; 0,<br \/>\n    
SM &#061; 1,<br \/>\n} P2PEngine;<\/p>\n<p>P2PEngine p2p_mechanism &#061; CE; \/\/ \u9ed8\u8ba4\u4f7f\u7528 Copy Engine<\/p>\n<p>\/\/ \u9519\u8bef\u68c0\u67e5\u5b8f<br \/>\n#define musaCheckError()                                                              \\\\<br \/>\n    {                                                                                   \\\\<br \/>\n        musaError_t e &#061; musaGetLastError();                                             \\\\<br \/>\n        if (e !&#061; musaSuccess) {                                                         \\\\<br \/>\n            printf(&#034;MUSA failure %s:%d: &#039;%s&#039;\\\\n&#034;, __FILE__, __LINE__, musaGetErrorString(e)); \\\\<br \/>\n            exit(EXIT_FAILURE);                                                         \\\\<br \/>\n        }                                                                               \\\\<br \/>\n    }<\/p>\n<p>\/\/ \u5ef6\u8fdf\u5185\u6838<br \/>\n__global__ void delay(volatile int *flag, unsigned long long timeout_clocks &#061; 10000000)<br \/>\n{<br \/>\n    \/\/ \u7b49\u5f85\u5e94\u7528\u7a0b\u5e8f\u901a\u77e5\u6211\u4eec\u5b83\u5df2\u7ecf\u5b8c\u6210\u4e86\u5b9e\u9a8c\u7684\u6392\u961f&#xff0c;\u6216\u8005\u8d85\u65f6\u5e76\u9000\u51fa&#xff0c;\u5141\u8bb8\u5e94\u7528\u7a0b\u5e8f\u7ee7\u7eed\u6267\u884c<br \/>\n    long long int start_clock, sample_clock;<br \/>\n    start_clock &#061; clock64();<\/p>\n<p>    while (!*flag) {<br \/>\n        sample_clock &#061; clock64();<\/p>\n<p>        if (sample_clock &#8211; start_clock &gt; timeout_clocks) {<br \/>\n            break;<br \/>\n        }<br \/>\n    }<br \/>\n}<\/p>\n<p>\/\/ P2P \u590d\u5236\u5185\u6838<br \/>\n__global__ void copyp2p(int4 *__restrict__ dest, const int4 *__restrict__ src, size_t num_elems) {<br \/>\n    size_t globalId &#061; blockIdx.x * blockDim.x &#043; threadIdx.x;<br \/>\n    size_t gridSize &#061; blockDim.x * gridDim.x;<\/p>\n<p>#pragma unroll 5 \/\/ \u79fb\u9664\u62ec\u53f7<br 
\/>\n    for (size_t i &#061; globalId; i &lt; num_elems; i &#043;&#061; gridSize) {<br \/>\n        dest[i] &#061; src[i];<br \/>\n    }<br \/>\n}<\/p>\n<p>\/\/ \u6253\u5370\u5e2e\u52a9\u4fe1\u606f<br \/>\nvoid printHelp(void) {<br \/>\n    printf(&#034;Usage:  p2pBandwidthLatencyTest [OPTION]&#8230;\\\\n&#034;);<br \/>\n    printf(&#034;Tests bandwidth\/latency of GPU pairs using P2P and without P2P\\\\n&#034;);<br \/>\n    printf(&#034;\\\\n&#034;);<br \/>\n    printf(&#034;Options:\\\\n&#034;);<br \/>\n    printf(&#034;&#8211;help\\\\t\\\\tDisplay this help menu\\\\n&#034;);<br \/>\n    printf(&#034;&#8211;p2p_read\\\\tUse P2P reads for data transfers between GPU pairs\\\\n&#034;);<br \/>\n    printf(&#034;&#8211;sm_copy\\\\tUse SM intiated p2p transfers instead of Copy Engine\\\\n&#034;);<br \/>\n    printf(&#034;&#8211;numElems&#061;&lt;NUM_OF_INT_ELEMS&gt;  Number of integer elements for p2p copy\\\\n&#034;);<br \/>\n}<\/p>\n<p>\/\/ \u68c0\u67e5P2P\u8bbf\u95ee<br \/>\nvoid checkP2Paccess(int numGPUs) {<br \/>\n    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        musaSetDevice(i);<br \/>\n        musaCheckError();<\/p>\n<p>        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            if (i !&#061; j) {<br \/>\n                int access;<br \/>\n                musaDeviceCanAccessPeer(&amp;access, i, j);<br \/>\n                musaCheckError();<br \/>\n                printf(&#034;Device&#061;%d %s Access Peer Device&#061;%d\\\\n&#034;, i, access ? 
&#034;CAN&#034; : &#034;CANNOT&#034;, j);<br \/>\n            }<br \/>\n        }<br \/>\n    }<br \/>\n    printf(&#034;\\\\n***NOTE: Devices without P2P access fall back to normal memcpy.\\\\n&#034;);<br \/>\n}<\/p>\n<p>\/\/ \u6267\u884cP2P\u590d\u5236<br \/>\nvoid performP2PCopy(int *dest, int destDevice, int *src, int srcDevice,<br \/>\n                    size_t num_elems, int repeat, bool p2paccess,<br \/>\n                    musaStream_t streamToRun) {<br \/>\n    int blockSize, numBlocks;<br \/>\n    musaOccupancyMaxPotentialBlockSize(&amp;numBlocks, &amp;blockSize, copyp2p);<br \/>\n    musaCheckError();<\/p>\n<p>    if (p2p_mechanism &#061;&#061; SM &amp;&amp; p2paccess) {<br \/>\n        for (int r &#061; 0; r &lt; repeat; r&#043;&#043;) {<br \/>\n            copyp2p&lt;&lt;&lt;numBlocks, blockSize, 0, streamToRun&gt;&gt;&gt;((int4*)dest, (int4*)src, num_elems\/4);<br \/>\n        }<br \/>\n    } else {<br \/>\n        for (int r &#061; 0; r &lt; repeat; r&#043;&#043;) {<br \/>\n            musaMemcpyPeerAsync(dest, destDevice, src, srcDevice,<br \/>\n                               sizeof(int)*num_elems, streamToRun);<br \/>\n            musaCheckError();<br \/>\n        }<br \/>\n    }<br \/>\n}<\/p>\n<p>\/\/ \u8f93\u51fa\u5e26\u5bbd\u77e9\u9635<br \/>\nvoid outputBandwidthMatrix(int numElems, int numGPUs, bool p2p, P2PDataTransfer p2p_method) {<br \/>\n    int repeat &#061; 5;<br \/>\n    volatile int *flag &#061; NULL;<br \/>\n    vector&lt;int *&gt; buffers(numGPUs);<br \/>\n    vector&lt;int *&gt; buffersD2D(numGPUs);<br \/>\n    vector&lt;musaEvent_t&gt; start(numGPUs);<br \/>\n    vector&lt;musaEvent_t&gt; stop(numGPUs);<br \/>\n    vector&lt;musaStream_t&gt; stream(numGPUs);<\/p>\n<p>    musaHostAlloc((void **)&amp;flag, sizeof(*flag), musaHostAllocPortable);<br \/>\n    musaCheckError();<\/p>\n<p>    for (int d &#061; 0; d &lt; numGPUs; d&#043;&#043;) {<br \/>\n        musaSetDevice(d);<br \/>\n        musaStreamCreateWithFlags(&amp;stream[d], 
musaStreamNonBlocking);<br \/>\n        musaMalloc(&amp;buffers[d], numElems * sizeof(int));<br \/>\n        musaMemset(buffers[d], 0, numElems * sizeof(int));<br \/>\n        musaMalloc(&amp;buffersD2D[d], numElems * sizeof(int));<br \/>\n        musaMemset(buffersD2D[d], 0, numElems * sizeof(int));<br \/>\n        musaCheckError();<br \/>\n        musaEventCreate(&amp;start[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventCreate(&amp;stop[d]);<br \/>\n        musaCheckError();<br \/>\n    }<\/p>\n<p>    vector&lt;double&gt; bandwidthMatrix(numGPUs * numGPUs);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        musaSetDevice(i);<\/p>\n<p>        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            int access &#061; 0;<br \/>\n            if (p2p) {<br \/>\n                musaDeviceCanAccessPeer(&amp;access, i, j);<br \/>\n                if (access) {<br \/>\n                    musaDeviceEnablePeerAccess(j, 0);<br \/>\n                    musaCheckError();<br \/>\n                    musaSetDevice(j);<br \/>\n                    musaDeviceEnablePeerAccess(i, 0);<br \/>\n                    musaCheckError();<br \/>\n                    musaSetDevice(i);<br \/>\n                    musaCheckError();<br \/>\n                }<br \/>\n            }<\/p>\n<p>            musaStreamSynchronize(stream[i]);<br \/>\n            musaCheckError();<\/p>\n<p>            \/\/ \u963b\u585e\u6d41&#xff0c;\u76f4\u5230\u6240\u6709\u5de5\u4f5c\u6392\u961f\u5b8c\u6210<br \/>\n            *flag &#061; 0;<br \/>\n            delay&lt;&lt;&lt;1, 1, 0, stream[i]&gt;&gt;&gt;(flag);<br \/>\n            musaCheckError();<br \/>\n            musaEventRecord(start[i], stream[i]);<br \/>\n            musaCheckError();<\/p>\n<p>            if (i &#061;&#061; j) {<br \/>\n                performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream[i]);<br \/>\n            }<br \/>\n            else {<br \/>\n  
              if (p2p_method &#061;&#061; P2P_WRITE) {<br \/>\n                    performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream[i]);<br \/>\n                }<br \/>\n                else {<br \/>\n                    performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream[i]);<br \/>\n                }<br \/>\n            }<\/p>\n<p>            musaEventRecord(stop[i], stream[i]);<br \/>\n            musaCheckError();<\/p>\n<p>            \/\/ \u91ca\u653e\u6392\u961f\u7684\u4e8b\u4ef6<br \/>\n            *flag &#061; 1;<br \/>\n            musaStreamSynchronize(stream[i]);<br \/>\n            musaCheckError();<\/p>\n<p>            float time_ms;<br \/>\n            musaEventElapsedTime(&amp;time_ms, start[i], stop[i]);<br \/>\n            double time_s &#061; time_ms \/ 1e3;<\/p>\n<p>            double gb &#061; numElems * sizeof(int) * repeat \/ (double)1e9;<br \/>\n            if (i &#061;&#061; j) {<br \/>\n                gb *&#061; 2;<br \/>\n            }<br \/>\n            bandwidthMatrix[i * numGPUs &#043; j] &#061; gb \/ time_s;<br \/>\n            if (p2p &amp;&amp; access) {<br \/>\n                musaDeviceDisablePeerAccess(j);<br \/>\n                musaSetDevice(j);<br \/>\n                musaDeviceDisablePeerAccess(i);<br \/>\n                musaSetDevice(i);<br \/>\n                musaCheckError();<br \/>\n            }<br \/>\n        }<br \/>\n    }<\/p>\n<p>    printf(&#034;   D\\\\\\\\D&#034;);<br \/>\n    for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n        printf(&#034;%6d &#034;, j);<br \/>\n    }<br \/>\n    printf(&#034;\\\\n&#034;);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        printf(&#034;%6d &#034;, i);<br \/>\n        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            printf(&#034;%6.02f &#034;, bandwidthMatrix[i * numGPUs &#043; j]);<br \/>\n        }<br \/>\n        printf(&#034;\\\\n&#034;);<br 
\/>\n    }<\/p>\n<p>    for (int d &#061; 0; d &lt; numGPUs; d&#043;&#043;) {<br \/>\n        musaSetDevice(d);<br \/>\n        musaFree(buffers[d]);<br \/>\n        musaFree(buffersD2D[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventDestroy(start[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventDestroy(stop[d]);<br \/>\n        musaCheckError();<br \/>\n        musaStreamDestroy(stream[d]);<br \/>\n        musaCheckError();<br \/>\n    }<\/p>\n<p>    musaFreeHost((void *)flag);<br \/>\n    musaCheckError();<br \/>\n}<\/p>\n<p>\/\/ \u8f93\u51fa\u53cc\u5411\u5e26\u5bbd\u77e9\u9635<br \/>\nvoid outputBidirectionalBandwidthMatrix(int numElems, int numGPUs, bool p2p) {<br \/>\n    int repeat &#061; 5;<br \/>\n    volatile int *flag &#061; NULL;<br \/>\n    vector&lt;int *&gt; buffers(numGPUs);<br \/>\n    vector&lt;int *&gt; buffersD2D(numGPUs);<br \/>\n    vector&lt;musaEvent_t&gt; start(numGPUs);<br \/>\n    vector&lt;musaEvent_t&gt; stop(numGPUs);<br \/>\n    vector&lt;musaStream_t&gt; stream0(numGPUs);<br \/>\n    vector&lt;musaStream_t&gt; stream1(numGPUs);<\/p>\n<p>    musaHostAlloc((void **)&amp;flag, sizeof(*flag), musaHostAllocPortable);<br \/>\n    musaCheckError();<\/p>\n<p>    for (int d &#061; 0; d &lt; numGPUs; d&#043;&#043;) {<br \/>\n        musaSetDevice(d);<br \/>\n        musaMalloc(&amp;buffers[d], numElems * sizeof(int));<br \/>\n        musaMemset(buffers[d], 0, numElems * sizeof(int));<br \/>\n        musaMalloc(&amp;buffersD2D[d], numElems * sizeof(int));<br \/>\n        musaMemset(buffersD2D[d], 0, numElems * sizeof(int));<br \/>\n        musaCheckError();<br \/>\n        musaEventCreate(&amp;start[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventCreate(&amp;stop[d]);<br \/>\n        musaCheckError();<br \/>\n        musaStreamCreateWithFlags(&amp;stream0[d], musaStreamNonBlocking);<br \/>\n        musaCheckError();<br \/>\n        musaStreamCreateWithFlags(&amp;stream1[d], 
musaStreamNonBlocking);<br \/>\n        musaCheckError();<br \/>\n    }<\/p>\n<p>    vector&lt;double&gt; bandwidthMatrix(numGPUs * numGPUs);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        musaSetDevice(i);<\/p>\n<p>        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            int access &#061; 0;<br \/>\n            if (p2p) {<br \/>\n                musaDeviceCanAccessPeer(&amp;access, i, j);<br \/>\n                if (access) {<br \/>\n                    musaSetDevice(i);<br \/>\n                    musaDeviceEnablePeerAccess(j, 0);<br \/>\n                    musaCheckError();<br \/>\n                    musaSetDevice(j);<br \/>\n                    musaDeviceEnablePeerAccess(i, 0);<br \/>\n                    musaCheckError();<br \/>\n                }<br \/>\n            }<\/p>\n<p>            musaSetDevice(i);<br \/>\n            musaStreamSynchronize(stream0[i]);<br \/>\n            musaStreamSynchronize(stream1[j]);<br \/>\n            musaCheckError();<\/p>\n<p>            \/\/ \u963b\u585e\u6d41&#xff0c;\u76f4\u5230\u6240\u6709\u5de5\u4f5c\u6392\u961f\u5b8c\u6210<br \/>\n            *flag &#061; 0;<br \/>\n            musaSetDevice(i);<br \/>\n            \/\/ \u65e0\u9700\u963b\u585e stream1&#xff0c;\u56e0\u4e3a\u5b83\u4f1a\u5728 stream0 \u7684\u4e8b\u4ef6\u4e0a\u963b\u585e<br \/>\n            delay&lt;&lt;&lt;1, 1, 0, stream0[i]&gt;&gt;&gt;(flag);<br \/>\n            musaCheckError();<\/p>\n<p>            \/\/ \u5f3a\u5236 stream1 \u5728 stream0 \u5f00\u59cb\u4e4b\u524d\u4e0d\u542f\u52a8&#xff0c;\u4ee5\u786e\u4fdd stream0 \u4e0a\u7684\u4e8b\u4ef6\u5b8c\u5168\u6db5\u76d6\u6240\u6709\u64cd\u4f5c\u6240\u9700\u7684\u65f6\u95f4<br \/>\n            musaEventRecord(start[i], stream0[i]);<br \/>\n            musaStreamWaitEvent(stream1[j], start[i], 0);<\/p>\n<p>            if (i &#061;&#061; j) {<br \/>\n                \/\/ \u5bf9\u4e8e GPU \u5185\u64cd\u4f5c&#xff0c;\u6267\u884c 2 
\u6b21\u5185\u5b58\u590d\u5236 buffersD2D &lt;-&gt; buffers<br \/>\n                performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream0[i]);<br \/>\n                performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat, access, stream1[i]);<br \/>\n            }<br \/>\n            else {<br \/>\n                if (access &amp;&amp; p2p_mechanism &#061;&#061; SM) {<br \/>\n                    musaSetDevice(j);<br \/>\n                }<br \/>\n                performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream1[j]);<br \/>\n                if (access &amp;&amp; p2p_mechanism &#061;&#061; SM) {<br \/>\n                    musaSetDevice(i);<br \/>\n                }<br \/>\n                performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream0[i]);<br \/>\n            }<\/p>\n<p>            \/\/ \u901a\u77e5 stream0 stream1 \u5df2\u5b8c\u6210&#xff0c;\u5e76\u8bb0\u5f55\u603b\u4e8b\u52a1\u7684\u65f6\u95f4<br \/>\n            musaEventRecord(stop[j], stream1[j]);<br \/>\n            musaStreamWaitEvent(stream0[i], stop[j], 0);<br \/>\n            musaEventRecord(stop[i], stream0[i]);<\/p>\n<p>            \/\/ \u91ca\u653e\u6392\u961f\u7684\u64cd\u4f5c<br \/>\n            *flag &#061; 1;<br \/>\n            musaStreamSynchronize(stream0[i]);<br \/>\n            musaStreamSynchronize(stream1[j]);<br \/>\n            musaCheckError();<\/p>\n<p>            float time_ms;<br \/>\n            musaEventElapsedTime(&amp;time_ms, start[i], stop[i]);<br \/>\n            double time_s &#061; time_ms \/ 1e3;<\/p>\n<p>            double gb &#061; 2.0 * numElems * sizeof(int) * repeat \/ (double)1e9;<br \/>\n            if (i &#061;&#061; j) {<br \/>\n                gb *&#061; 2;<br \/>\n            }<br \/>\n            bandwidthMatrix[i * numGPUs &#043; j] &#061; gb \/ time_s;<br \/>\n            if (p2p &amp;&amp; access) {<br \/>\n                musaSetDevice(i);<br \/>\n                
musaDeviceDisablePeerAccess(j);<br \/>\n                musaSetDevice(j);<br \/>\n                musaDeviceDisablePeerAccess(i);<br \/>\n            }<br \/>\n        }<br \/>\n    }<\/p>\n<p>    printf(&#034;   D\\\\\\\\D&#034;);<br \/>\n    for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n        printf(&#034;%6d &#034;, j);<br \/>\n    }<br \/>\n    printf(&#034;\\\\n&#034;);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        printf(&#034;%6d &#034;, i);<br \/>\n        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            printf(&#034;%6.02f &#034;, bandwidthMatrix[i * numGPUs &#043; j]);<br \/>\n        }<br \/>\n        printf(&#034;\\\\n&#034;);<br \/>\n    }<\/p>\n<p>    for (int d &#061; 0; d &lt; numGPUs; d&#043;&#043;) {<br \/>\n        musaSetDevice(d);<br \/>\n        musaFree(buffers[d]);<br \/>\n        musaFree(buffersD2D[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventDestroy(start[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventDestroy(stop[d]);<br \/>\n        musaCheckError();<br \/>\n        musaStreamDestroy(stream0[d]);<br \/>\n        musaCheckError();<br \/>\n        musaStreamDestroy(stream1[d]);<br \/>\n        musaCheckError();<br \/>\n    }<\/p>\n<p>    musaFreeHost((void *)flag);<br \/>\n    musaCheckError();<br \/>\n}<\/p>\n<p>\/\/ \u8f93\u51fa\u5ef6\u8fdf\u77e9\u9635<br \/>\nvoid outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {<br \/>\n    int repeat &#061; 100;<br \/>\n    int numElems &#061; 4; \/\/ \u6267\u884c 1 \u4e2a int4 \u4f20\u8f93<br \/>\n    volatile int *flag &#061; NULL;<br \/>\n    vector&lt;int *&gt; buffers(numGPUs);<br \/>\n    vector&lt;int *&gt; buffersD2D(numGPUs); \/\/ \u7528\u4e8e D2D&#xff08;\u5373 GPU \u5185\u590d\u5236&#xff09;\u7684\u7f13\u51b2\u533a<br \/>\n    vector&lt;musaStream_t&gt; stream(numGPUs);<br \/>\n    vector&lt;musaEvent_t&gt; start(numGPUs);<br \/>\n    
vector&lt;musaEvent_t&gt; stop(numGPUs);<\/p>\n<p>    musaHostAlloc((void **)&amp;flag, sizeof(*flag), musaHostAllocPortable);<br \/>\n    musaCheckError();<\/p>\n<p>    for (int d &#061; 0; d &lt; numGPUs; d&#043;&#043;) {<br \/>\n        musaSetDevice(d);<br \/>\n        musaStreamCreateWithFlags(&amp;stream[d], musaStreamNonBlocking);<br \/>\n        musaMalloc(&amp;buffers[d], sizeof(int) * numElems);<br \/>\n        musaMemset(buffers[d], 0, sizeof(int) * numElems);<br \/>\n        musaMalloc(&amp;buffersD2D[d], sizeof(int) * numElems);<br \/>\n        musaMemset(buffersD2D[d], 0, sizeof(int) * numElems);<br \/>\n        musaCheckError();<br \/>\n        musaEventCreate(&amp;start[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventCreate(&amp;stop[d]);<br \/>\n        musaCheckError();<br \/>\n    }<\/p>\n<p>    vector&lt;double&gt; gpuLatencyMatrix(numGPUs * numGPUs);<br \/>\n    vector&lt;double&gt; cpuLatencyMatrix(numGPUs * numGPUs);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        musaSetDevice(i);<\/p>\n<p>        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            int access &#061; 0;<br \/>\n            if (p2p) {<br \/>\n                musaDeviceCanAccessPeer(&amp;access, i, j);<br \/>\n                if (access) {<br \/>\n                    musaDeviceEnablePeerAccess(j, 0);<br \/>\n                    musaCheckError();<br \/>\n                    musaSetDevice(j);<br \/>\n                    musaDeviceEnablePeerAccess(i, 0);<br \/>\n                    musaSetDevice(i);<br \/>\n                    musaCheckError();<br \/>\n                }<br \/>\n            }<br \/>\n            musaStreamSynchronize(stream[i]);<br \/>\n            musaCheckError();<\/p>\n<p>            \/\/ \u963b\u585e\u6d41&#xff0c;\u76f4\u5230\u6240\u6709\u5de5\u4f5c\u6392\u961f\u5b8c\u6210<br \/>\n            *flag &#061; 0;<br \/>\n            delay&lt;&lt;&lt;1, 1, 0, stream[i]&gt;&gt;&gt;(flag);<br 
\/>\n            musaCheckError();<br \/>\n            musaEventRecord(start[i], stream[i]);<\/p>\n<p>            if (i &#061;&#061; j) {<br \/>\n                \/\/ \u6267\u884c GPU \u5185\u7684 D2D \u590d\u5236<br \/>\n                performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream[i]);<br \/>\n            }<br \/>\n            else {<br \/>\n                if (p2p_method &#061;&#061; P2P_WRITE) {<br \/>\n                    performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream[i]);<br \/>\n                }<br \/>\n                else {<br \/>\n                    performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream[i]);<br \/>\n                }<br \/>\n            }<\/p>\n<p>            musaEventRecord(stop[i], stream[i]);<br \/>\n            \/\/ \u73b0\u5728\u5de5\u4f5c\u5df2\u7ecf\u6392\u961f\u5b8c\u6210&#xff0c;\u91ca\u653e\u6d41<br \/>\n            *flag &#061; 1;<br \/>\n            musaStreamSynchronize(stream[i]);<br \/>\n            musaCheckError();<\/p>\n<p>            float gpu_time_ms;<br \/>\n            musaEventElapsedTime(&amp;gpu_time_ms, start[i], stop[i]);<\/p>\n<p>            gpuLatencyMatrix[i * numGPUs &#043; j] &#061; gpu_time_ms * 1e3 \/ repeat;<br \/>\n            if (p2p &amp;&amp; access) {<br \/>\n                musaDeviceDisablePeerAccess(j);<br \/>\n                musaSetDevice(j);<br \/>\n                musaDeviceDisablePeerAccess(i);<br \/>\n                musaSetDevice(i);<br \/>\n                musaCheckError();<br \/>\n            }<br \/>\n        }<br \/>\n    }<\/p>\n<p>    printf(&#034;   GPU&#034;);<br \/>\n    for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n        printf(&#034;%6d &#034;, j);<br \/>\n    }<br \/>\n    printf(&#034;\\\\n&#034;);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        printf(&#034;%6d &#034;, i);<br \/>\n        for (int j &#061; 0; j &lt; numGPUs; 
j&#043;&#043;) {<br \/>\n            printf(&#034;%6.02f &#034;, gpuLatencyMatrix[i * numGPUs &#043; j]);<br \/>\n        }<br \/>\n        printf(&#034;\\\\n&#034;);<br \/>\n    }<\/p>\n<p>    for (int d &#061; 0; d &lt; numGPUs; d&#043;&#043;) {<br \/>\n        musaSetDevice(d);<br \/>\n        musaFree(buffers[d]);<br \/>\n        musaFree(buffersD2D[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventDestroy(start[d]);<br \/>\n        musaCheckError();<br \/>\n        musaEventDestroy(stop[d]);<br \/>\n        musaCheckError();<br \/>\n        musaStreamDestroy(stream[d]);<br \/>\n        musaCheckError();<br \/>\n    }<\/p>\n<p>    musaFreeHost((void *)flag);<br \/>\n    musaCheckError();<br \/>\n}<\/p>\n<p>\/\/ \u4e3b\u51fd\u6570<br \/>\nint main(int argc, char **argv) {<br \/>\n    int numGPUs, numElems &#061; 40000000;<br \/>\n    P2PDataTransfer p2p_method &#061; P2P_WRITE;<\/p>\n<p>    musaGetDeviceCount(&amp;numGPUs);<br \/>\n    musaCheckError();<\/p>\n<p>    \/\/ \u5904\u7406\u547d\u4ee4\u884c\u53c2\u6570<br \/>\n    for (int i &#061; 1; i &lt; argc; i&#043;&#043;) {<br \/>\n        if (strcmp(argv[i], &#034;&#8211;help&#034;) &#061;&#061; 0) {<br \/>\n            printHelp();<br \/>\n            return 0;<br \/>\n        } else if (strcmp(argv[i], &#034;&#8211;p2p_read&#034;) &#061;&#061; 0) {<br \/>\n            p2p_method &#061; P2P_READ;<br \/>\n        } else if (strcmp(argv[i], &#034;&#8211;sm_copy&#034;) &#061;&#061; 0) {<br \/>\n            p2p_mechanism &#061; SM;<br \/>\n        } else if (strncmp(argv[i], &#034;&#8211;numElems&#061;&#034;, 11) &#061;&#061; 0) {<br \/>\n            numElems &#061; atoi(argv[i] &#043; 11);<br \/>\n        }<br \/>\n    }<\/p>\n<p>    printf(&#034;[%s]\\\\n&#034;, sSampleName);<\/p>\n<p>    \/\/ \u8f93\u51fa\u8bbe\u5907\u4fe1\u606f<br \/>\n    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        musaDeviceProp prop;<br \/>\n        musaGetDeviceProperties(&amp;prop, i);<br \/>\n     
   printf(&#034;Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\\\\n&#034;,<br \/>\n               i, prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);<br \/>\n    }<\/p>\n<p>    checkP2Paccess(numGPUs);<\/p>\n<p>    \/\/ \u8f93\u51faP2P\u8fde\u63a5\u77e9\u9635<br \/>\n    printf(&#034;P2P Connectivity Matrix\\\\n&#034;);<br \/>\n    printf(&#034;     D\\\\\\\\D&#034;);<br \/>\n    for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n        printf(&#034;%6d&#034;, j);<br \/>\n    }<br \/>\n    printf(&#034;\\\\n&#034;);<\/p>\n<p>    for (int i &#061; 0; i &lt; numGPUs; i&#043;&#043;) {<br \/>\n        printf(&#034;%6d\\\\t&#034;, i);<br \/>\n        for (int j &#061; 0; j &lt; numGPUs; j&#043;&#043;) {<br \/>\n            if (i !&#061; j) {<br \/>\n                int access;<br \/>\n                musaDeviceCanAccessPeer(&amp;access, i, j);<br \/>\n                printf(&#034;%6d&#034;, (access) ? 1 : 0);<br \/>\n            } else {<br \/>\n                printf(&#034;%6d&#034;, 1);<br \/>\n            }<br \/>\n        }<br \/>\n        printf(&#034;\\\\n&#034;);<br \/>\n    }<\/p>\n<p>    \/\/ \u8f93\u51fa\u5404\u79cd\u6d4b\u8bd5\u7ed3\u679c<br \/>\n    printf(&#034;Unidirectional P2P&#061;Disabled Bandwidth Matrix (GB\/s)\\\\n&#034;);<br \/>\n    outputBandwidthMatrix(numElems, numGPUs, false, P2P_WRITE);<br \/>\n    printf(&#034;Unidirectional P2P&#061;Enabled Bandwidth (P2P Writes) Matrix (GB\/s)\\\\n&#034;);<br \/>\n    outputBandwidthMatrix(numElems, numGPUs, true, P2P_WRITE);<br \/>\n    if (p2p_method &#061;&#061; P2P_READ) {<br \/>\n        printf(&#034;Unidirectional P2P&#061;Enabled Bandwidth (P2P Reads) Matrix (GB\/s)\\\\n&#034;);<br \/>\n        outputBandwidthMatrix(numElems, numGPUs, true, p2p_method);<br \/>\n    }<br \/>\n    printf(&#034;Bidirectional P2P&#061;Disabled Bandwidth Matrix (GB\/s)\\\\n&#034;);<br \/>\n    outputBidirectionalBandwidthMatrix(numElems, numGPUs, false);<br \/>\n    
printf(&#034;Bidirectional P2P&#061;Enabled Bandwidth Matrix (GB\/s)\\\\n&#034;);<br \/>\n    outputBidirectionalBandwidthMatrix(numElems, numGPUs, true);<\/p>\n<p>    printf(&#034;P2P&#061;Disabled Latency Matrix (us)\\\\n&#034;);<br \/>\n    outputLatencyMatrix(numGPUs, false, P2P_WRITE);<br \/>\n    printf(&#034;P2P&#061;Enabled Latency (P2P Writes) Matrix (us)\\\\n&#034;);<br \/>\n    outputLatencyMatrix(numGPUs, true, P2P_WRITE);<br \/>\n    if (p2p_method &#061;&#061; P2P_READ) {<br \/>\n        printf(&#034;P2P&#061;Enabled Latency (P2P Reads) Matrix (us)\\\\n&#034;);<br \/>\n        outputLatencyMatrix(numGPUs, true, p2p_method);<br \/>\n    }<\/p>\n<p>    printf(&#034;\\\\nNOTE: Results may vary when GPU Boost is enabled.\\\\n&#034;);<\/p>\n<p>    return 0;<br \/>\n}<\/p>\n<h4>\u7f16\u8bd1<\/h4>\n<p>\u53c2\u8003mcc\u7f16\u8bd1\u624b\u518c&#xff0c;\u6b64\u65f6\u4ee3\u7801\u4e2d\u5f15\u7528\u7684\u5e93\u4e3amusa_runtime,\u5219\u7f16\u8bd1\u662f-l\u53c2\u6570\u540e\u8ddfmusart<\/p>\n<p>mcc p2p.mu -o p2p -lmusart<\/p>\n<h4>\u7ed3\u679c<\/h4>\n<p>\u53ef\u4ee5\u770b\u5230p2p\u5df2\u7ecf\u6b63\u786e\u5f00\u542f&#xff0c;\u4f46\u662f\u5ef6\u8fdf\u6d4b\u8bd5\u6709\u95ee\u9898&#xff0c;\u540e\u7eed\u6539\u8fdb<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"1268\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054355-6836a29b0b678.png\" width=\"684\" \/><\/p>\n<h3>\u57fa\u4e8emusa\u7f16\u7a0b\u7684allreduce\u6d4b\u8bd5<\/h3>\n<h4>\u4ee3\u7801\u53c2\u8003<\/h4>\n<p>\u4e3b\u8981\u53c2\u8003\u4e86NCCLtest\u4e2d\u7684allreduce\u90e8\u5206\u903b\u8f91<\/p>\n<p>GitHub &#8211; NVIDIA\/nccl-tests: NCCL Tests<\/p>\n<p>\u5e76\u4e14\u53c2\u8003\u4e86mublas api\u8bbe\u8ba1 https:\/\/docs.mthreads.com\/musa-sdk\/musa-sdk-doc-online\/api\/mublas_api<\/p>\n<h4>\u4ee3\u7801\u90e8\u5206<\/h4>\n<p>#include &lt;stdio.h&gt;<br \/>\n#include &lt;stdlib.h&gt;<br \/>\n#include &lt;math.h&gt;<br \/>\n#include &#034;musa_runtime.h&#034;<br 
\/>\n#include &#034;mccl.h&#034;<br \/>\n#include &lt;inttypes.h&gt; \/\/ \u5fc5\u987b\u5305\u542b\u6b64\u5934\u6587\u4ef6<\/p>\n<p>\/\/ \u5b8f\u5b9a\u4e49&#xff08;\u6240\u6709\u6807\u8bc6\u7b26\u5728\u6b64\u5904\u58f0\u660e&#xff09;<br \/>\n#define MIN_SIZE_B       16ULL           \/\/ \u6700\u5c0f\u6d4b\u8bd5\u5c3a\u5bf8&#xff08;16\u5b57\u8282&#xff09;<br \/>\n#define MAX_SIZE_B  (4096ULL * 1024ULL * 1024ULL)  \/\/ \u6700\u5927\u6d4b\u8bd5\u5c3a\u5bf8&#xff08;4096MB&#xff09;<br \/>\n#define STEP_FACTOR      2ULL           \/\/ \u5c3a\u5bf8\u589e\u957f\u56e0\u5b50&#xff08;\u6bcf\u6b21\u7ffb\u500d&#xff09;<br \/>\n#define WARMUP_ITERS       5             \/\/ \u70ed\u8eab\u8fed\u4ee3\u6b21\u6570<br \/>\n#define TEST_ITERS        20             \/\/ \u6d4b\u8bd5\u8fed\u4ee3\u6b21\u6570<br \/>\n#define ROOT_RANK        -1             \/\/ \u6839\u8282\u70b9&#xff08;-1\u8868\u793a\u5168\u5f52\u7ea6&#xff09;<br \/>\n#define DATA_TYPE         mcclFloat      \/\/ \u6570\u636e\u7c7b\u578b<br \/>\n#define REDUCTION_OP      mcclSum        \/\/ \u5f52\u7ea6\u64cd\u4f5c<br \/>\n#define FLOAT_SIZE        sizeof(float)  \/\/ float\u7c7b\u578b\u5b57\u8282\u6570&#xff08;4\u5b57\u8282&#xff09;<\/p>\n<p>\/\/ \u9519\u8bef\u68c0\u67e5\u5b8f<br \/>\n#define MUSACHECK(cmd) do { \\\\<br \/>\n    musaError_t err &#061; cmd; \\\\<br \/>\n    if (err !&#061; musaSuccess) { \\\\<br \/>\n        printf(&#034;MUSA Error at %s:%d: %s\\\\n&#034;, __FILE__, __LINE__, musaGetErrorString(err)); \\\\<br \/>\n        exit(EXIT_FAILURE); \\\\<br \/>\n    } \\\\<br \/>\n} while(0)<\/p>\n<p>#define MCCLCHECK(cmd) do { \\\\<br \/>\n    mcclResult_t res &#061; cmd; \\\\<br \/>\n    if (res !&#061; mcclSuccess) { \\\\<br \/>\n        printf(&#034;MCCL Error at %s:%d: %s\\\\n&#034;, __FILE__, __LINE__, mcclGetErrorString(res)); \\\\<br \/>\n        exit(EXIT_FAILURE); \\\\<br \/>\n    } \\\\<br \/>\n} while(0)<\/p>\n<p>\/\/ \u5e26\u5bbd\u8ba1\u7b97\u51fd\u6570<br \/>\nvoid calculate_bandwidth(size_t count, int 
type_size, double time_sec, double* alg_bw, double* bus_bw, int nranks) {<br \/>\n    if (time_sec &lt;&#061; 0 || count &#061;&#061; 0) {<br \/>\n        *alg_bw &#061; 0.0;<br \/>\n        *bus_bw &#061; 0.0;<br \/>\n        return;<br \/>\n    }<br \/>\n    double data_size_gb &#061; (double)(count * type_size) \/ 1e9;<br \/>\n    *alg_bw &#061; data_size_gb \/ time_sec;<br \/>\n    double factor &#061; (nranks &gt; 1) ? (2.0 * (nranks &#8211; 1)) \/ nranks : 1.0;<br \/>\n    *bus_bw &#061; *alg_bw * factor;<br \/>\n}<\/p>\n<p>int main(int argc, char* argv[]) {<br \/>\n    int nDev &#061; 4;                  \/\/ \u8bbe\u5907\u6570\u91cf<br \/>\n    int devs[4] &#061; {0, 1, 2, 3};     \/\/ \u8bbe\u5907ID\u5217\u8868<br \/>\n    mcclComm_t comms[4];           \/\/ MCCL\u901a\u4fe1\u5668<br \/>\n    musaStream_t streams[4];       \/\/ \u6d41\u6570\u7ec4<br \/>\n    float** sendbuff &#061; NULL;       \/\/ \u53d1\u9001\u7f13\u51b2\u533a<br \/>\n    float** recvbuff &#061; NULL;       \/\/ \u63a5\u6536\u7f13\u51b2\u533a<br \/>\n    size_t current_size_b &#061; MIN_SIZE_B;  \/\/ \u5f53\u524d\u6d4b\u8bd5\u5c3a\u5bf8&#xff08;\u5b57\u8282&#xff09;<br \/>\n    double alg_bw, bus_bw;          \/\/ \u7b97\u6cd5\u5e26\u5bbd\u548c\u603b\u7ebf\u5e26\u5bbd<br \/>\n    int test_wrong &#061; 0;             \/\/ \u9519\u8bef\u8ba1\u6570<\/p>\n<p>    \/\/ \u521d\u59cb\u5316MCCL\u901a\u4fe1\u5668<br \/>\n    MCCLCHECK(mcclCommInitAll(comms, nDev, devs));<\/p>\n<p>    \/\/ \u5206\u914d\u8bbe\u5907\u5185\u5b58\u5e76\u521b\u5efa\u6d41<br \/>\n    sendbuff &#061; (float**)malloc(nDev * sizeof(float*));<br \/>\n    recvbuff &#061; (float**)malloc(nDev * sizeof(float*));<br \/>\n    for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n        MUSACHECK(musaSetDevice(i));<br \/>\n        MUSACHECK(musaMalloc(&amp;sendbuff[i], MAX_SIZE_B));        \/\/ \u5206\u914d\u6700\u5927\u5c3a\u5bf8\u5185\u5b58<br \/>\n        MUSACHECK(musaMalloc(&amp;recvbuff[i], MAX_SIZE_B));<br \/>\n        
MUSACHECK(musaStreamCreate(&amp;streams[i]));               \/\/ \u521b\u5efa\u72ec\u7acb\u6d41<br \/>\n    }<\/p>\n<p>    \/\/ \u6253\u5370\u7ed3\u679c\u8868\u5934<br \/>\n    printf(&#034;| %10s | %10s | %5s | %4s | %14s | %13s | %13s | %13s | %5s |\\\\n&#034;,<br \/>\n       &#034;size (B)&#034;, &#034;count&#034;, &#034;type&#034;, &#034;root&#034;, &#034;warmup_time (us)&#034;, &#034;test_time (us)&#034;, &#034;alg_bw (GB\/s)&#034;, &#034;bus_bw (GB\/s)&#034;, &#034;#wrong&#034;);<br \/>\n    printf(&#034;|&#8212;&#8212;&#8212;&#8212;|&#8212;&#8212;&#8212;&#8212;|&#8212;&#8212;-|&#8212;&#8212;|&#8212;&#8212;&#8212;&#8212;&#8212;&#8212;|&#8212;&#8212;&#8212;&#8212;&#8212;-|&#8212;&#8212;&#8212;&#8212;&#8212;|&#8212;&#8212;&#8212;&#8212;&#8212;|&#8212;&#8212;&#8211;|\\\\n&#034;);<br \/>\n    \/\/ \u5c3a\u5bf8\u5faa\u73af\u6d4b\u8bd5<br \/>\n    while (current_size_b &lt;&#061; MAX_SIZE_B) {<br \/>\n        size_t element_count &#061; current_size_b \/ FLOAT_SIZE;  \/\/ \u5143\u7d20\u6570\u91cf<\/p>\n<p>        \/\/ \u8df3\u8fc7\u975e\u5bf9\u9f50\u5c3a\u5bf8<br \/>\n        if (current_size_b % FLOAT_SIZE !&#061; 0) {<br \/>\n            current_size_b *&#061; STEP_FACTOR;<br \/>\n            continue;<br \/>\n        }<\/p>\n<p>        \/\/ \u521d\u59cb\u5316\u8bbe\u5907\u6570\u636e&#xff08;\u901a\u8fc7\u4e3b\u673a\u5185\u5b58\u6b63\u786e\u8d4b\u503c\u4e3a1.0f&#xff09;<br \/>\n        for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n            MUSACHECK(musaSetDevice(i));<br \/>\n            float* host_buf &#061; (float*)malloc(current_size_b);<br \/>\n            for (size_t j &#061; 0; j &lt; element_count; &#043;&#043;j) host_buf[j] &#061; 1.0f;<br \/>\n            MUSACHECK(musaMemcpy(sendbuff[i], host_buf, current_size_b, musaMemcpyHostToDevice));<br \/>\n            free(host_buf);<br \/>\n            MUSACHECK(musaMemset(recvbuff[i], 0, current_size_b));<br \/>\n        }<\/p>\n<p>        \/\/ 
\u70ed\u8eab\u8fed\u4ee3&#xff08;\u5305\u542b\u6d41\u540c\u6b65&#xff09;<br \/>\n        for (int warmup &#061; 0; warmup &lt; WARMUP_ITERS; &#043;&#043;warmup) {<br \/>\n            MCCLCHECK(mcclGroupStart());<br \/>\n            for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n                MCCLCHECK(mcclAllReduce(<br \/>\n                    sendbuff[i], recvbuff[i],<br \/>\n                    element_count, DATA_TYPE, REDUCTION_OP,<br \/>\n                    comms[i], streams[i]<br \/>\n                ));<br \/>\n            }<br \/>\n            MCCLCHECK(mcclGroupEnd());<br \/>\n            for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n                MUSACHECK(musaSetDevice(i));<br \/>\n                MUSACHECK(musaStreamSynchronize(streams[i]));<br \/>\n            }<br \/>\n        }<\/p>\n<p>        \/\/ \u4e8b\u4ef6\u8ba1\u65f6&#xff08;\u4ec5\u5728\u4e3b\u8bbe\u59070\u64cd\u4f5c&#xff09;<br \/>\n        musaEvent_t start, stop;<br \/>\n        MUSACHECK(musaSetDevice(0));<br \/>\n        MUSACHECK(musaEventCreate(&amp;start));<br \/>\n        MUSACHECK(musaEventCreate(&amp;stop));<br \/>\n        MUSACHECK(musaEventRecord(start, streams[0]));<\/p>\n<p>        \/\/ \u6d4b\u8bd5\u8fed\u4ee3&#xff08;\u5305\u542b\u5b8c\u6574Group\u64cd\u4f5c&#xff09;<br \/>\n        MCCLCHECK(mcclGroupStart());<br \/>\n        for (int iter &#061; 0; iter &lt; TEST_ITERS; &#043;&#043;iter) {<br \/>\n            for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n                MUSACHECK(musaSetDevice(i));<br \/>\n                MCCLCHECK(mcclAllReduce(<br \/>\n                    sendbuff[i], recvbuff[i],<br \/>\n                    element_count, DATA_TYPE, REDUCTION_OP,<br \/>\n                    comms[i], streams[i]<br \/>\n                ));<br \/>\n            }<br \/>\n        }<br \/>\n        MCCLCHECK(mcclGroupEnd());<\/p>\n<p>        MUSACHECK(musaEventRecord(stop, streams[0]));<br \/>\n        
MUSACHECK(musaEventSynchronize(stop));<\/p>\n<p>        \/\/ \u8ba1\u7b97\u5e73\u5747\u65f6\u95f4<br \/>\n        float total_time_ms;<br \/>\n        MUSACHECK(musaEventElapsedTime(&amp;total_time_ms, start, stop));<br \/>\n        double avg_time_us &#061; (total_time_ms \/ TEST_ITERS) * 1000;<\/p>\n<p>        \/\/ \u8ba1\u7b97\u5e26\u5bbd<br \/>\n        calculate_bandwidth(element_count, FLOAT_SIZE, avg_time_us \/ 1e6, &amp;alg_bw, &amp;bus_bw, nDev);<\/p>\n<p>        \/\/ \u9a8c\u8bc1\u7ed3\u679c&#xff08;\u5141\u8bb8\u6d6e\u70b9\u7cbe\u5ea6\u8bef\u5dee&#xff09;<br \/>\n        test_wrong &#061; 0;<br \/>\n        float expected &#061; (float)nDev;<br \/>\n        for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n            MUSACHECK(musaSetDevice(i));<br \/>\n            float* h_recv &#061; (float*)malloc(current_size_b);<br \/>\n            MUSACHECK(musaMemcpy(h_recv, recvbuff[i], current_size_b, musaMemcpyDeviceToHost));<br \/>\n            for (size_t j &#061; 0; j &lt; element_count; &#043;&#043;j) {<br \/>\n                if (fabs(h_recv[j] &#8211; expected) &gt; 1e-6) test_wrong&#043;&#043;;<br \/>\n            }<br \/>\n            free(h_recv);<br \/>\n        }<\/p>\n<p>        \/\/ \u6253\u5370\u7ed3\u679c<br \/>\n        printf(&#034;| %10&#034; PRIu64 &#034; | %10&#034; PRIu64 &#034; | %4s | %4d | %16.3f | %14.3f | %13.3f | %13.3f | %6d |\\\\n&#034;,<br \/>\n       (uint64_t)current_size_b, (uint64_t)element_count, &#034;float&#034;, ROOT_RANK, 0.0, avg_time_us, alg_bw, bus_bw, test_wrong);<\/p>\n<p>        \/\/ \u9500\u6bc1\u4e8b\u4ef6<br \/>\n        MUSACHECK(musaSetDevice(0));<br \/>\n        MUSACHECK(musaEventDestroy(start));<br \/>\n        MUSACHECK(musaEventDestroy(stop));<\/p>\n<p>        \/\/ \u589e\u5927\u6d4b\u8bd5\u5c3a\u5bf8<br \/>\n        current_size_b *&#061; STEP_FACTOR;<br \/>\n    }<\/p>\n<p>    \/\/ \u91ca\u653e\u8d44\u6e90<br \/>\n    for (int i &#061; 0; i &lt; nDev; &#043;&#043;i) {<br \/>\n        
MUSACHECK(musaSetDevice(i));<br \/>\n        MUSACHECK(musaFree(sendbuff[i]));<br \/>\n        MUSACHECK(musaFree(recvbuff[i]));<br \/>\n        MUSACHECK(musaStreamDestroy(streams[i]));<br \/>\n        mcclCommDestroy(comms[i]);<br \/>\n    }<br \/>\n    free(sendbuff);<br \/>\n    free(recvbuff);<\/p>\n<p>    printf(&#034;AllReduce Test Completed Successfully\\\\n&#034;);<br \/>\n    return 0;<br \/>\n} <\/p>\n<h4>\u7f16\u8bd1<\/h4>\n<p>\u56e0\u4e3a\u4ee3\u7801\u7528\u4e86musa_runtime\u4e0emccl\u4e24\u4e2a\u5e93&#xff0c;\u56e0\u6b64\u7f16\u8bd1\u9009\u9879\u4e5f\u4f1a\u6709\u6240\u6539\u53d8<\/p>\n<p>mcc allreduce.mu -o allreduce -lmusart -lmccl<\/p>\n<h4>\u7ed3\u679c<\/h4>\n<p>\u4e0d\u77e5\u9053\u4e3a\u4ec0\u4e48\u7ed3\u679c\u6d4b\u51fa\u6765\u548c\u7528pytorch\u6d4b\u51fa\u6765\u7ed3\u679c\u76f8\u5dee\u4e0d\u5c0f&#xff0c;\u76ee\u6d4b\u662f\u56e0\u4e3amusa event\u6253\u70b9\u8ba1\u65f6\u51fd\u6570\u6ca1\u4f7f\u7528\u6b63\u786e&#xff08;\u5728p2p\u6d4b\u8bd5\u7684\u81ea\u4ea4\u4e2d\u4e5f\u6709\u4f53\u73b0&#xff0c;\u4e0d\u7ba1\u4ec0\u4e48\u60c5\u51b5\u90fd\u662f50us\u5de6\u53f3&#xff09;&#xff0c;\u8fd9\u4e2a\u9700\u8981\u540e\u7eed\u518d\u770b\u4e0b<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"645\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054355-6836a29b44ff8.png\" width=\"1197\" \/><\/p>\n<p><img loading=\"lazy\" decoding=\"async\" alt=\"\" height=\"759\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054355-6836a29b9a600.png\" width=\"687\" \/><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb1.1k\u6b21\uff0c\u70b9\u8d5e16\u6b21\uff0c\u6536\u85cf12\u6b21\u3002MTT S4000 \u662f\u57fa\u4e8e\u6469\u5c14\u7ebf\u7a0b\u66f2\u9662 GPU 
\u67b6\u6784\u6253\u9020\u7684\u5168\u529f\u80fd\u5143\u8ba1\u7b97\u5361\uff0c\u4e3a\u5343\u4ebf\u89c4\u6a21\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u8fdb\u884c\u4e86\u5b9a\u5236\u4f18\u5316\uff0c\u7ed3\u5408\u5148\u8fdb\u7684\u56fe\u5f62\u6e32\u67d3\u80fd\u529b\u3001\u89c6\u9891\u7f16\u89e3\u7801\u80fd\u529b\u548c\u8d85\u9ad8\u6e05 8K HDR \u663e\u793a\u80fd\u529b\uff0c\u52a9\u529b\u4eba\u5de5\u667a\u80fd\u3001\u56fe\u5f62\u6e32\u67d3\u3001\u591a\u5a92\u4f53\u3001\u79d1\u5b66\u8ba1\u7b97\u4e0e\u7269\u7406\u4eff\u771f\u7b49\u590d\u5408\u5e94\u7528\u573a\u666f\u7684\u8ba1\u7b97\u52a0\u901f\u3002MTT S4000 \u5168\u9762\u652f\u6301\u5927\u8bed\u8a00\u6a21\u578b\u7684\u9884\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u670d\u52a1\uff0cMUSA \u8f6f\u4ef6\u6808\u4e13\u95e8\u9488\u5bf9\u5927\u89c4\u6a21\u96c6\u7fa4\u7684\u5206\u5e03\u5f0f\u8ba1\u7b97\u6027\u80fd\u8fdb\u884c\u4e86\u4f18\u5316\uff0c\u9002\u914d\u4e3b\u6d41\u5206\u5e03\u5f0f\u8ba1\u7b97\u52a0\u901f\u6846\u67b6\uff0c \u5305\u62ec DeepSpeed\uff0c Colossal AI\uff0cMegatron \u7b49\uff0c\u652f_\u6469\u5c14\u7ebf\u7a0b\u4fe1\u521b<\/p>\n","protected":false},"author":2,"featured_media":40324,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[66,1458,81,152,50,3694,43,86],"topic":[],"class_list":["post-40335","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-server","tag-ai","tag-hpc","tag-python","tag-pytorch","tag-50","tag-3694","tag-43","tag-86"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta 
name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/40335.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb1.1k\u6b21\uff0c\u70b9\u8d5e16\u6b21\uff0c\u6536\u85cf12\u6b21\u3002MTT S4000 \u662f\u57fa\u4e8e\u6469\u5c14\u7ebf\u7a0b\u66f2\u9662 GPU \u67b6\u6784\u6253\u9020\u7684\u5168\u529f\u80fd\u5143\u8ba1\u7b97\u5361\uff0c\u4e3a\u5343\u4ebf\u89c4\u6a21\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u8fdb\u884c\u4e86\u5b9a\u5236\u4f18\u5316\uff0c\u7ed3\u5408\u5148\u8fdb\u7684\u56fe\u5f62\u6e32\u67d3\u80fd\u529b\u3001\u89c6\u9891\u7f16\u89e3\u7801\u80fd\u529b\u548c\u8d85\u9ad8\u6e05 8K HDR \u663e\u793a\u80fd\u529b\uff0c\u52a9\u529b\u4eba\u5de5\u667a\u80fd\u3001\u56fe\u5f62\u6e32\u67d3\u3001\u591a\u5a92\u4f53\u3001\u79d1\u5b66\u8ba1\u7b97\u4e0e\u7269\u7406\u4eff\u771f\u7b49\u590d\u5408\u5e94\u7528\u573a\u666f\u7684\u8ba1\u7b97\u52a0\u901f\u3002MTT S4000 \u5168\u9762\u652f\u6301\u5927\u8bed\u8a00\u6a21\u578b\u7684\u9884\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u670d\u52a1\uff0cMUSA \u8f6f\u4ef6\u6808\u4e13\u95e8\u9488\u5bf9\u5927\u89c4\u6a21\u96c6\u7fa4\u7684\u5206\u5e03\u5f0f\u8ba1\u7b97\u6027\u80fd\u8fdb\u884c\u4e86\u4f18\u5316\uff0c\u9002\u914d\u4e3b\u6d41\u5206\u5e03\u5f0f\u8ba1\u7b97\u52a0\u901f\u6846\u67b6\uff0c \u5305\u62ec DeepSpeed\uff0c Colossal AI\uff0cMegatron \u7b49\uff0c\u652f_\u6469\u5c14\u7ebf\u7a0b\u4fe1\u521b\" \/>\n<meta property=\"og:url\" 
content=\"https:\/\/www.wsisp.com\/helps\/40335.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2025-05-28T05:43:56+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054352-6836a29866a81.png\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"15 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/40335.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/40335.html\",\"name\":\"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2025-05-28T05:43:56+00:00\",\"dateModified\":\"2025-05-28T05:43:56+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/40335.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/40335.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/40335.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required 
name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/40335.html","og_locale":"zh_CN","og_type":"article","og_title":"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb1.1k\u6b21\uff0c\u70b9\u8d5e16\u6b21\uff0c\u6536\u85cf12\u6b21\u3002MTT S4000 \u662f\u57fa\u4e8e\u6469\u5c14\u7ebf\u7a0b\u66f2\u9662 GPU \u67b6\u6784\u6253\u9020\u7684\u5168\u529f\u80fd\u5143\u8ba1\u7b97\u5361\uff0c\u4e3a\u5343\u4ebf\u89c4\u6a21\u5927\u8bed\u8a00\u6a21\u578b\u7684\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u8fdb\u884c\u4e86\u5b9a\u5236\u4f18\u5316\uff0c\u7ed3\u5408\u5148\u8fdb\u7684\u56fe\u5f62\u6e32\u67d3\u80fd\u529b\u3001\u89c6\u9891\u7f16\u89e3\u7801\u80fd\u529b\u548c\u8d85\u9ad8\u6e05 8K HDR 
\u663e\u793a\u80fd\u529b\uff0c\u52a9\u529b\u4eba\u5de5\u667a\u80fd\u3001\u56fe\u5f62\u6e32\u67d3\u3001\u591a\u5a92\u4f53\u3001\u79d1\u5b66\u8ba1\u7b97\u4e0e\u7269\u7406\u4eff\u771f\u7b49\u590d\u5408\u5e94\u7528\u573a\u666f\u7684\u8ba1\u7b97\u52a0\u901f\u3002MTT S4000 \u5168\u9762\u652f\u6301\u5927\u8bed\u8a00\u6a21\u578b\u7684\u9884\u8bad\u7ec3\u3001\u5fae\u8c03\u548c\u63a8\u7406\u670d\u52a1\uff0cMUSA \u8f6f\u4ef6\u6808\u4e13\u95e8\u9488\u5bf9\u5927\u89c4\u6a21\u96c6\u7fa4\u7684\u5206\u5e03\u5f0f\u8ba1\u7b97\u6027\u80fd\u8fdb\u884c\u4e86\u4f18\u5316\uff0c\u9002\u914d\u4e3b\u6d41\u5206\u5e03\u5f0f\u8ba1\u7b97\u52a0\u901f\u6846\u67b6\uff0c \u5305\u62ec DeepSpeed\uff0c Colossal AI\uff0cMegatron \u7b49\uff0c\u652f_\u6469\u5c14\u7ebf\u7a0b\u4fe1\u521b","og_url":"https:\/\/www.wsisp.com\/helps\/40335.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2025-05-28T05:43:56+00:00","og_image":[{"url":"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/05\/20250528054352-6836a29866a81.png"}],"author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"15 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/40335.html","url":"https:\/\/www.wsisp.com\/helps\/40335.html","name":"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2025-05-28T05:43:56+00:00","dateModified":"2025-05-28T05:43:56+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/40335.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/40335.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/40335.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u6469\u5c14\u7ebf\u7a0bS4000\u56fd\u4ea7\u4fe1\u521b\u8ba1\u7b97\u5361\u6027\u80fd\u5b9e\u6218\u2014\u2014Pytorch\u8f6c\u8bd1\uff0c\u591a\u5361P2P\u901a\u4fe1\u4e0eMUSA\u7f16\u7a0b"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/40335","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=40335"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/40335\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media\/40324"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=40335"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=40335"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=40335"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=40335"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}