{"id":64383,"date":"2026-01-23T13:40:30","date_gmt":"2026-01-23T05:40:30","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/64383.html"},"modified":"2026-01-23T13:40:30","modified_gmt":"2026-01-23T05:40:30","slug":"%e5%9b%be%e8%a7%a3transformer%e4%b8%8epytorch%e5%ae%9e%e7%8e%b0","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/64383.html","title":{"rendered":"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0"},"content":{"rendered":"<h2>\u524d\u8a00&#xff1a;\u63a8\u5f00\u90a3\u6247\u901a\u5f80\u201c\u4e0a\u5e1d\u89c6\u89d2\u201d\u7684\u95e8<\/h2>\n<h4>&#x1f32a;\ufe0f \u4e00\u573a\u9759\u6084\u6084\u7684\u9769\u547d<\/h4>\n<p>\u628a\u65f6\u9488\u62e8\u56de 2017 \u5e74\u3002<\/p>\n<p>\u90a3\u65f6\u5019\u7684 AI \u8fd8\u5728\u8d39\u52b2\u5730\u50cf\u5c0f\u5b66\u751f\u4e00\u6837&#xff0c;\u4e00\u4e2a\u5b57\u4e00\u4e2a\u5b57\u5730\u8bfb\u8bfe\u6587&#xff08;RNN\/LSTM \u65f6\u4ee3&#xff09;\u3002\u5b83\u8bfb\u5230\u53e5\u5b50\u672b\u5c3e\u65f6&#xff0c;\u5f80\u5f80\u5df2\u7ecf\u5fd8\u4e86\u5f00\u5934\u8bb2\u4e86\u4ec0\u4e48\u3002<\/p>\n<p>\u5c31\u5728\u90a3\u4e00\u5e74&#xff0c;Google \u7684\u51e0\u4f4d\u5de5\u7a0b\u5e08\u4e22\u51fa\u4e86\u4e00\u7bc7\u8bba\u6587\u2014\u2014\u300aAttention Is All You Need\u300b\u3002<\/p>\n<p>\u8fd9\u7bc7\u8bba\u6587\u5c31\u50cf\u4e00\u5757\u5de8\u77f3\u7838\u8fdb\u4e86\u5e73\u9759\u7684\u6e56\u9762&#xff0c;\u6fc0\u8d77\u7684\u6d9f\u6f2a\u6700\u7ec8\u6f14\u53d8\u6210\u4e86\u4eca\u5929\u7684\u6d77\u5578&#xff1a;BERT\u3001GPT-3\u3001ChatGPT\u3001Sora\u2026\u2026 \u8fd9\u4e9b\u9707\u64bc\u4e16\u754c\u7684\u6a21\u578b&#xff0c;\u5265\u5f00\u5916\u8863\u540e&#xff0c;\u91cc\u9762\u8eba\u7740\u7684\u90fd\u662f\u540c\u4e00\u4e2a\u67b6\u6784\u2014\u2014Transformer\u3002<\/p>\n<p>\u5b83\u662f AI \u65f6\u4ee3\u7684\u84b8\u6c7d\u673a&#xff0c;\u662f\u901a\u5f80\u901a\u7528\u4eba\u5de5\u667a\u80fd&#xff08;AGI&#xff09;\u7684\u5165\u573a\u5238\u3002<\/p>\n<h4>&#x1f92f; 
\u4e3a\u4ec0\u4e48\u8981\u5199\u8fd9\u7bc7\u535a\u5ba2&#xff1f;<\/h4>\n<p>\u5982\u679c\u4f60\u5c1d\u8bd5\u8fc7\u53bb\u8bfb\u539f\u8bba\u6587&#xff0c;\u6216\u8005\u641c\u8fc7\u7f51\u4e0a\u7684\u6559\u7a0b&#xff0c;\u4f60\u53ef\u80fd\u7ecf\u5386\u8fc7\u4ee5\u4e0b\u5fc3\u6001\u5d29\u6e83\u7684\u77ac\u95f4&#xff1a;<\/p>\n<li>\u88ab\u6570\u5b66\u529d\u9000&#xff1a;\u770b\u5230\u6ee1\u5c4f\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mtext>softmax<\/mtext><mo stretchy=\"false\">(<\/mo><mfrac><mrow><mi>Q<\/mi><msup><mi>K<\/mi><mi>T<\/mi><\/msup><\/mrow><msqrt><msub><mi>d<\/mi><mi>k<\/mi><\/msub><\/msqrt><\/mfrac><mo stretchy=\"false\">)<\/mo><mi>V<\/mi><\/mrow><annotation encoding=\"application\/x-tex\">\\text{softmax}(\\frac{QK^T}{\\sqrt{d_k}})V<\/annotation><\/semantics><\/math><\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.6275em;vertical-align: -0.538em\"><\/span><span class=\"mord text\"><span class=\"mord\">softmax<\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord\"><span class=\"mopen nulldelimiter\"><\/span><span class=\"mfrac\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 1.0895em\"><span class=\"\" style=\"top: -2.5864em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord sqrt mtight\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8622em\"><span class=\"svg-align\" style=\"top: -3em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord mtight\" style=\"padding-left: 0.833em\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3448em\"><span class=\"\" style=\"top: -2.3488em;margin-left: 0em;margin-right: 0.0714em\"><span class=\"pstrut\" style=\"height: 2.5em\"><\/span><span 
class=\"sizing reset-size3 size1 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0315em\">k<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1512em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"\" style=\"top: -2.8222em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"hide-tail mtight\" style=\"min-width: 0.853em;height: 1.08em\"><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1778em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"\" style=\"top: -3.23em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"frac-line\" style=\"border-bottom-width: 0.04em\"><\/span><\/span><span class=\"\" style=\"top: -3.4461em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">Q<\/span><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.9191em\"><span class=\"\" style=\"top: -2.931em;margin-right: 0.0714em\"><span class=\"pstrut\" style=\"height: 2.5em\"><\/span><span class=\"sizing reset-size3 size1 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.1389em\">T<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.538em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><span class=\"mclose nulldelimiter\"><\/span><\/span><span class=\"mclose\">)<\/span><span 
class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span> \u76f4\u63a5\u5173\u7f51\u9875\u3002<\/li>\n<li>\u88ab\u56fe\u8868\u529d\u9000&#xff1a;\u770b\u5230\u90a3\u4e9b\u7bad\u5934\u6ee1\u5929\u98de\u7684\u67b6\u6784\u56fe&#xff0c;\u773c\u775b\u5b66\u4f1a\u4e86&#xff0c;\u8111\u5b50\u6ca1\u5b66\u4f1a\u3002<\/li>\n<li>\u4f3c\u61c2\u975e\u61c2&#xff1a;\u201c\u6211\u77e5\u9053\u5b83\u53eb\u81ea\u6ce8\u610f\u529b\u673a\u5236&#xff0c;\u4f46\u5b83\u5230\u5e95\u5728\u6ce8\u610f\u4e2a\u5565&#xff1f;\u201d<\/li>\n<p>\u8fd9\u7bc7\u535a\u5ba2\u4e0d\u6253\u7b97\u8fd9\u4e48\u5e72\u3002<\/p>\n<p>\u6211\u4eec\u4e0d\u8ffd\u6c42\u628a\u6bcf\u4e2a\u6570\u5b66\u63a8\u5bfc\u90fd\u5199\u5f97\u5929\u8863\u65e0\u7f1d&#xff0c;\u6211\u4eec\u8ffd\u6c42\u7684\u662f\u76f4\u89c9&#xff08;Intuition&#xff09;\u3002 \u6211\u4f1a\u7528\u642d\u4e50\u9ad8\u3001\u67e5\u5b57\u5178\u3001\u5f00\u76f8\u4eb2\u5927\u4f1a\u3001\u505a\u9605\u8bfb\u7406\u89e3\u7b49\u5404\u79cd\u751f\u52a8\u7684\u6bd4\u55bb&#xff0c;\u5e26\u4f60\u94bb\u8fdb Transformer \u7684\u9ed1\u76d2\u5b50\u91cc&#xff0c;\u770b\u7740\u6570\u636e\u662f\u5982\u4f55\u5728\u91cc\u9762\u6d41\u52a8\u7684\u3002<\/p>\n<p>\u6211\u4eec\u5c06\u628a\u8fd9\u4e2a\u590d\u6742\u7684\u5de8\u517d\u62c6\u89e3\u6210\u4e00\u4e2a\u4e2a\u7b80\u5355\u7684\u96f6\u4ef6&#xff0c;\u6700\u540e\u4f60\u4f1a\u53d1\u73b0&#xff1a;\u539f\u6765\u6240\u8c13\u7684\u201c\u795e\u7ea7\u67b6\u6784\u201d&#xff0c;\u4e5f\u4e0d\u8fc7\u5c31\u662f\u7cbe\u5999\u7684\u5de5\u7a0b\u5b66\u79ef\u6728\u7f62\u4e86\u3002<\/p>\n<h4>&#x1f5fa;\ufe0f \u6211\u4eec\u7684\u63a2\u9669\u8def\u7ebf\u56fe<\/h4>\n<p>\u4e3a\u4e86\u4e0d\u8ba9\u4f60\u5728\u8ff7\u5bab\u91cc\u6655\u5934\u8f6c\u5411&#xff0c;\u8fd9\u662f\u6211\u4eec\u63a5\u4e0b\u6765\u7684\u65c5\u7a0b\u89c4\u5212&#xff1a;<\/p>\n<p>  #mermaid-svg-IXDGPm1GZoXtXwek{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes 
dash{to{stroke-dashoffset:0;}}#mermaid-svg-IXDGPm1GZoXtXwek .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-IXDGPm1GZoXtXwek .error-icon{fill:#552222;}#mermaid-svg-IXDGPm1GZoXtXwek .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-IXDGPm1GZoXtXwek .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-IXDGPm1GZoXtXwek .marker{fill:#333333;stroke:#333333;}#mermaid-svg-IXDGPm1GZoXtXwek .marker.cross{stroke:#333333;}#mermaid-svg-IXDGPm1GZoXtXwek svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-IXDGPm1GZoXtXwek p{margin:0;}#mermaid-svg-IXDGPm1GZoXtXwek .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-IXDGPm1GZoXtXwek .cluster-label text{fill:#333;}#mermaid-svg-IXDGPm1GZoXtXwek .cluster-label span{color:#333;}#mermaid-svg-IXDGPm1GZoXtXwek .cluster-label span p{background-color:transparent;}#mermaid-svg-IXDGPm1GZoXtXwek .label text,#mermaid-svg-IXDGPm1GZoXtXwek span{fill:#333;color:#333;}#mermaid-svg-IXDGPm1GZoXtXwek .node rect,#mermaid-svg-IXDGPm1GZoXtXwek .node circle,#mermaid-svg-IXDGPm1GZoXtXwek .node ellipse,#mermaid-svg-IXDGPm1GZoXtXwek .node polygon,#mermaid-svg-IXDGPm1GZoXtXwek .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-IXDGPm1GZoXtXwek .rough-node .label text,#mermaid-svg-IXDGPm1GZoXtXwek .node .label 
text,#mermaid-svg-IXDGPm1GZoXtXwek .image-shape .label,#mermaid-svg-IXDGPm1GZoXtXwek .icon-shape .label{text-anchor:middle;}#mermaid-svg-IXDGPm1GZoXtXwek .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-IXDGPm1GZoXtXwek .rough-node .label,#mermaid-svg-IXDGPm1GZoXtXwek .node .label,#mermaid-svg-IXDGPm1GZoXtXwek .image-shape .label,#mermaid-svg-IXDGPm1GZoXtXwek .icon-shape .label{text-align:center;}#mermaid-svg-IXDGPm1GZoXtXwek .node.clickable{cursor:pointer;}#mermaid-svg-IXDGPm1GZoXtXwek .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-IXDGPm1GZoXtXwek .arrowheadPath{fill:#333333;}#mermaid-svg-IXDGPm1GZoXtXwek .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-IXDGPm1GZoXtXwek .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-IXDGPm1GZoXtXwek .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-IXDGPm1GZoXtXwek .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-IXDGPm1GZoXtXwek .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-IXDGPm1GZoXtXwek .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-IXDGPm1GZoXtXwek .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-IXDGPm1GZoXtXwek .cluster text{fill:#333;}#mermaid-svg-IXDGPm1GZoXtXwek .cluster span{color:#333;}#mermaid-svg-IXDGPm1GZoXtXwek div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-IXDGPm1GZoXtXwek .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-IXDGPm1GZoXtXwek rect.text{fill:none;stroke-width:0;}#mermaid-svg-IXDGPm1GZoXtXwek .icon-shape,#mermaid-svg-IXDGPm1GZoXtXwek .image-shape{background-color:rgba(232,232,232, 
0.8);text-align:center;}#mermaid-svg-IXDGPm1GZoXtXwek .icon-shape p,#mermaid-svg-IXDGPm1GZoXtXwek .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-IXDGPm1GZoXtXwek .icon-shape rect,#mermaid-svg-IXDGPm1GZoXtXwek .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-IXDGPm1GZoXtXwek .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-IXDGPm1GZoXtXwek .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-IXDGPm1GZoXtXwek :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u751f\u6210\u4e0e\u8f93\u51fa<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u6838\u5fc3\u7ec4\u4ef6\u62c6\u89e3<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u8d77\u70b9<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7b2c1\u7ae0: \u4e3a\u4ec0\u4e48\u8981\u629b\u5f03RNN?<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7b2c2\u7ae0: \u5b8f\u89c2\u4fef\u77b0 Transformer<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7b2c3\u7ae0: Embedding &amp; \u4f4d\u7f6e\u7f16\u7801<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7b2c4-5\u7ae0: \u81ea\u6ce8\u610f\u529b &amp; \u591a\u5934\u673a\u5236<\/p>\n<p><\/span><\/p>\n<p>         <span 
class=\"nodeLabel\"><\/p>\n<p>\u7b2c6-7\u7ae0: \u6b8b\u5dee\u8fde\u63a5 &amp; FFN<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7b2c8\u7ae0: \u89e3\u7801\u5668\u7684 Mask \u79d8\u5bc6<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7b2c9\u7ae0: \u8f93\u51fa\u4e0e\u672a\u6765<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u7ec8\u70b9: \u89c9\u9192<\/p>\n<p><\/span><\/p>\n<hr \/>\n<h4>&#x1f4da; \u76ee\u5f55&#xff1a;Transformer \u62c6\u89e3\u4e4b\u65c5<\/h4>\n<ul>\n<li>\n<p>[\u7b2c\u4e00\u7ae0&#xff1a;\u544a\u522b\u201c\u63a5\u529b\u8dd1\u201d\u2014\u2014\u4e3a\u4ec0\u4e48\u8981\u629b\u5f03 RNN&#xff1f;]<\/p>\n<ul>\n<li>\u770b\u65e7\u65f6\u4ee3\u7684\u9738\u4e3b\u662f\u5982\u4f55\u56e0\u201c\u8bb0\u6027\u4e0d\u597d\u201d\u548c\u201c\u817f\u811a\u592a\u6162\u201d\u88ab\u5386\u53f2\u6dd8\u6c70\u7684\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u4e8c\u7ae0&#xff1a;\u9ed1\u76d2\u4e0e\u4e50\u9ad8\u2014\u2014\u5b8f\u89c2\u4fef\u77b0 Transformer]<\/p>\n<ul>\n<li>\u4e0d\u8c08\u7ec6\u8282&#xff0c;\u5148\u770b\u9aa8\u67b6\u3002\u8ba4\u8bc6 Encoder \u548c Decoder \u8fd9\u5bf9\u5b6a\u751f\u5144\u5f1f\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u4e09\u7ae0&#xff1a;\u5355\u8bcd\u7684 GPS\u2014\u2014Embedding \u4e0e\u4f4d\u7f6e\u7f16\u7801]<\/p>\n<ul>\n<li>\u673a\u5668\u4e0d\u8bc6\u5b57\u4e5f\u4e0d\u8bc6\u6570&#xff0c;\u600e\u4e48\u8ba9\u5b83\u77e5\u9053\u201c\u6211\u7231\u4f60\u201d\u548c\u201c\u4f60\u7231\u6211\u201d\u7684\u533a\u522b&#xff1f;<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u56db\u7ae0&#xff1a;\u7075\u9b42\u6838\u5fc3\u2014\u2014\u81ea\u6ce8\u610f\u529b\u673a\u5236 (Self-Attention)]<\/p>\n<ul>\n<li>\u5168\u7bc7\u6700\u786c\u6838&#xff01;\u770b AI \u5982\u4f55\u901a\u8fc7\u201c\u76f8\u4eb2\u5927\u4f1a\u201d\u627e\u51fa\u4e00\u53e5\u8bdd\u91cc\u7684\u9690\u85cf\u5173\u7cfb\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u4e94\u7ae0&#xff1a;\u591a\u5934\u6ce8\u610f\u529b (Multi-Head 
Attention)\u2014\u2014\u4e09\u4e2a\u81ed\u76ae\u5320&#xff0c;\u9876\u4e2a\u8bf8\u845b\u4eae]<\/p>\n<ul>\n<li>\u4e3a\u4ec0\u4e48\u8981\u641e 8 \u4e2a\u8111\u888b&#xff1f;\u8bba\u201c\u591a\u7ef4\u5ea6\u601d\u8003\u201d\u7684\u91cd\u8981\u6027\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u516d\u7ae0&#xff1a;\u8fde\u63a5\u4e0e\u5f52\u4e00\u5316\u2014\u2014\u6b8b\u5dee\u8fde\u63a5 (Residual) \u4e0e LayerNorm]<\/p>\n<ul>\n<li>\u9632\u6b62\u697c\u76d6\u584c\u7684\u5de5\u7a0b\u5b66\u5947\u8ff9&#xff0c;\u7ed9\u4fe1\u606f\u5f00\u901a\u201cVIP \u76f4\u901a\u8f66\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u4e03\u7ae0&#xff1a;\u524d\u9988\u795e\u7ecf\u7f51\u7edc (Feed-Forward Networks)\u2014\u2014\u8bb0\u5fc6\u7684\u7cbe\u70bc]<\/p>\n<ul>\n<li>Attention \u8d1f\u8d23\u793e\u4ea4&#xff0c;FFN \u8d1f\u8d23\u601d\u8003\u3002\u63ed\u79d8\u53c2\u6570\u91cf\u6700\u5927\u7684\u201c\u8bb0\u5fc6\u533a\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u516b\u7ae0&#xff1a;\u89e3\u7801\u5668\u7684\u72ec\u89d2\u620f\u2014\u2014Masked Multi-Head Attention]<\/p>\n<ul>\n<li>\u8003\u8bd5\u4e0d\u80fd\u4f5c\u5f0a&#xff01;\u770b Decoder \u5982\u4f55\u6234\u7740\u201c\u773c\u7f69\u201d\u9884\u6d4b\u672a\u6765\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u7b2c\u4e5d\u7ae0&#xff1a;\u7ec8\u7ae0\u2014\u2014\u8f93\u51fa\u4e0e\u672a\u6765]<\/p>\n<ul>\n<li>\u4ece\u4e00\u5806\u6570\u5b57\u53d8\u56de\u4eba\u7c7b\u8bed\u8a00\u3002BERT \u548c GPT \u5bb6\u65cf\u662f\u5982\u4f55\u5206\u9053\u626c\u9573\u7684&#xff1f;<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>[\u9644\u5f55 1&#xff1a;\u4ee3\u7801\u5b9e\u6218] &#xff08;PyTorch \u9010\u884c\u624b\u6495 Transformer&#xff09;<\/p>\n<\/li>\n<li>\n<p>[\u540e\u8bb0] &#xff08;\u8d70\u51fa\u9ed1\u76d2&#xff0c;\u770b\u89c1\u661f\u8fb0\u5927\u6d77&#xff09;<\/p>\n<\/li>\n<\/ul>\n<hr \/>\n<h2>\u7b2c\u4e00\u7ae0&#xff1a;\u544a\u522b\u201c\u63a5\u529b\u8dd1\u201d\u2014\u2014\u4e3a\u4ec0\u4e48\u8981\u629b\u5f03 RNN&#xff1f;<\/h2>\n<p>\u5728 2017 
\u5e74\u90a3\u7bc7\u8457\u540d\u7684\u8bba\u6587\u300aAttention Is All You Need\u300b\u6a2a\u7a7a\u51fa\u4e16\u4e4b\u524d&#xff0c;\u81ea\u7136\u8bed\u8a00\u5904\u7406&#xff08;NLP&#xff09;\u7684\u4e16\u754c\u662f\u7531 RNN&#xff08;\u5faa\u73af\u795e\u7ecf\u7f51\u7edc&#xff09; \u548c\u5b83\u7684\u5347\u7ea7\u7248 LSTM&#xff08;\u957f\u77ed\u671f\u8bb0\u5fc6\u7f51\u7edc&#xff09; \u7edf\u6cbb\u7684\u3002<\/p>\n<p>\u867d\u7136\u5b83\u4eec\u5728\u5f53\u65f6\u4e5f\u662f\u5212\u65f6\u4ee3\u7684\u53d1\u660e&#xff0c;\u4f46\u5b83\u4eec\u8eab\u4e0a\u6709\u4e00\u4e2a\u81f4\u547d\u7684\u57fa\u56e0\u7f3a\u9677\u2014\u2014\u65f6\u5e8f\u4f9d\u8d56&#xff08;Sequential Dependency&#xff09;\u3002<\/p>\n<h4>1. \u4ee5\u524d\u7684 AI \u662f\u600e\u4e48\u201c\u8bfb\u4e66\u201d\u7684&#xff1f;<\/h4>\n<p>\u60f3\u8c61\u4e00\u4e0b&#xff0c;\u4f60\u8ba9\u4e00\u4e2a\u65e7\u65f6\u4ee3\u7684 AI \u7ffb\u8bd1\u8fd9\u53e5\u8bdd&#xff1a;<\/p>\n<p>\u201cThe cat didn\u2019t cross the street because it was too tired.\u201d &#xff08;\u8fd9\u53ea\u732b\u6ca1\u6709\u8fc7\u9a6c\u8def&#xff0c;\u56e0\u4e3a\u5b83\u592a\u7d2f\u4e86\u3002&#xff09;<\/p>\n<p>RNN \u5904\u7406\u8fd9\u53e5\u8bdd\u7684\u65b9\u5f0f&#xff0c;\u5c31\u50cf\u662f\u4e00\u4e2a\u53ea\u80fd\u770b\u89c1\u773c\u524d\u4e00\u4e2a\u5355\u8bcd\u7684\u9605\u8bfb\u8005&#xff0c;\u6216\u8005\u50cf\u662f\u5728\u73a9<strong>\u201c\u4f20\u8bdd\u6e38\u620f\u201d<\/strong>&#xff08;\u63a5\u529b\u8dd1&#xff09;\u3002<\/p>\n<li>\u5b83\u8bfb\u5165 The&#xff0c;\u5927\u8111\u91cc\u751f\u6210\u4e00\u4e2a\u8bb0\u5fc6&#xff08;\u9690\u85cf\u72b6\u6001 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><msub><mi>h<\/mi><mn>1<\/mn><\/msub><\/mrow><annotation encoding=\"application\/x-tex\">h_1<\/annotation><\/semantics><\/math><\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">h<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" 
style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff09;\u3002<\/li>\n<li>\u5e26\u7740 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><msub><mi>h<\/mi><mn>1<\/mn><\/msub><\/mrow><annotation encoding=\"application\/x-tex\">h_1<\/annotation><\/semantics><\/math><\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">h<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u7684\u8bb0\u5fc6&#xff0c;\u5b83\u8bfb\u5165 cat&#xff0c;\u751f\u6210\u65b0\u7684\u8bb0\u5fc6 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><msub><mi>h<\/mi><mn>2<\/mn><\/msub><\/mrow><annotation encoding=\"application\/x-tex\">h_2<\/annotation><\/semantics><\/math><\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">h<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span 
class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>\u3002<\/li>\n<li>\u5e26\u7740 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><msub><mi>h<\/mi><mn>2<\/mn><\/msub><\/mrow><annotation encoding=\"application\/x-tex\">h_2<\/annotation><\/semantics><\/math><\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">h<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u7684\u8bb0\u5fc6&#xff0c;\u5b83\u8bfb\u5165 didn&#039;t\u2026\u2026<\/li>\n<li>\u2026\u2026<\/li>\n<li>\u76f4\u5230\u8bfb\u5230\u6700\u540e\u9762\u7684 it \u65f6&#xff0c;\u5b83\u5fc5\u987b\u5728\u65e0\u6570\u6b21\u201c\u8bb0\u5fc6\u66f4\u65b0\u201d\u540e&#xff0c;\u4f9d\u7136\u80fd\u51c6\u786e\u5730\u4ece\u8111\u5b50\u91cc\u7ffb\u51fa cat \u7684\u4fe1\u606f&#xff0c;\u4ece\u800c\u77e5\u9053\u8fd9\u4e2a it \u6307\u7684\u662f\u732b&#xff0c;\u800c\u4e0d\u662f\u9a6c\u8def\u3002<\/li>\n<h5>\u274c 
\u75db\u70b9\u4e00&#xff1a;\u8ddd\u79bb\u4ea7\u751f\u7684\u4e0d\u662f\u7f8e&#xff0c;\u662f\u9057\u5fd8<\/h5>\n<p>\u5728\u8fd9\u4e2a\u201c\u63a5\u529b\u8dd1\u201d\u7684\u8fc7\u7a0b\u4e2d&#xff0c;\u968f\u7740\u53e5\u5b50\u8d8a\u6765\u8d8a\u957f&#xff0c;\u5f00\u5934\u7684\u4fe1\u606f\u5c31\u50cf\u4f20\u8bdd\u6e38\u620f\u4e00\u6837&#xff0c;\u8d8a\u4f20\u8d8a\u6a21\u7cca\u3002\u8fd9\u5c31\u662f\u8457\u540d\u7684\u957f\u8ddd\u79bb\u4f9d\u8d56\u95ee\u9898&#xff08;Long-term Dependency Problem&#xff09;\u3002\u867d\u7136 LSTM \u901a\u8fc7\u201c\u9057\u5fd8\u95e8\u201d\u5f3a\u884c\u7eed\u547d&#xff0c;\u4f46\u9762\u5bf9\u51e0\u767e\u4e2a\u8bcd\u7684\u957f\u6587&#xff0c;\u5b83\u4f9d\u7136\u4f1a\u201c\u8001\u5e74\u75f4\u5446\u201d\u3002<\/p>\n<h5>\u274c \u75db\u70b9\u4e8c&#xff1a;\u663e\u5361\u6709\u52b2\u513f\u4f7f\u4e0d\u4e0a<\/h5>\n<p>\u8fd9\u662f\u6700\u8ba9\u5de5\u7a0b\u5e08\u6293\u72c2\u7684\u3002\u56e0\u4e3a\u7b2c 100 \u4e2a\u8bcd\u7684\u8ba1\u7b97\u5fc5\u987b\u7b49\u7b2c 99 \u4e2a\u8bcd\u7b97\u5b8c\u3002\u8fd9\u610f\u5473\u7740\u4f60\u4e70\u4e86\u6700\u8d35\u7684 NVIDIA \u663e\u5361&#xff0c;\u6709\u51e0\u5343\u4e2a\u8ba1\u7b97\u6838\u5fc3&#xff0c;\u7ed3\u679c\u5b83\u4eec\u53ea\u80fd\u6392\u961f\u4e00\u4e2a\u63a5\u4e00\u4e2a\u5730\u5e72\u6d3b&#xff0c;\u65e0\u6cd5\u5e76\u884c\u3002<\/p>\n<p>\u6211\u4eec\u6765\u770b\u4e00\u5f20\u56fe&#xff0c;\u611f\u53d7\u4e00\u4e0b\u8fd9\u79cd\u201c\u618b\u5c48\u201d&#xff1a;<\/p>\n<p>  #mermaid-svg-aVKJOF1sPUtKQxdL{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-aVKJOF1sPUtKQxdL 
.error-icon{fill:#552222;}#mermaid-svg-aVKJOF1sPUtKQxdL .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-aVKJOF1sPUtKQxdL .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-aVKJOF1sPUtKQxdL .marker{fill:#333333;stroke:#333333;}#mermaid-svg-aVKJOF1sPUtKQxdL .marker.cross{stroke:#333333;}#mermaid-svg-aVKJOF1sPUtKQxdL svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-aVKJOF1sPUtKQxdL p{margin:0;}#mermaid-svg-aVKJOF1sPUtKQxdL .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL .cluster-label text{fill:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL .cluster-label span{color:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL .cluster-label span p{background-color:transparent;}#mermaid-svg-aVKJOF1sPUtKQxdL .label text,#mermaid-svg-aVKJOF1sPUtKQxdL span{fill:#333;color:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL .node rect,#mermaid-svg-aVKJOF1sPUtKQxdL .node circle,#mermaid-svg-aVKJOF1sPUtKQxdL .node ellipse,#mermaid-svg-aVKJOF1sPUtKQxdL .node polygon,#mermaid-svg-aVKJOF1sPUtKQxdL .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-aVKJOF1sPUtKQxdL .rough-node .label text,#mermaid-svg-aVKJOF1sPUtKQxdL .node .label text,#mermaid-svg-aVKJOF1sPUtKQxdL .image-shape .label,#mermaid-svg-aVKJOF1sPUtKQxdL .icon-shape .label{text-anchor:middle;}#mermaid-svg-aVKJOF1sPUtKQxdL .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-aVKJOF1sPUtKQxdL .rough-node .label,#mermaid-svg-aVKJOF1sPUtKQxdL .node .label,#mermaid-svg-aVKJOF1sPUtKQxdL .image-shape .label,#mermaid-svg-aVKJOF1sPUtKQxdL .icon-shape 
.label{text-align:center;}#mermaid-svg-aVKJOF1sPUtKQxdL .node.clickable{cursor:pointer;}#mermaid-svg-aVKJOF1sPUtKQxdL .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-aVKJOF1sPUtKQxdL .arrowheadPath{fill:#333333;}#mermaid-svg-aVKJOF1sPUtKQxdL .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-aVKJOF1sPUtKQxdL .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-aVKJOF1sPUtKQxdL .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-aVKJOF1sPUtKQxdL .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-aVKJOF1sPUtKQxdL .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-aVKJOF1sPUtKQxdL .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-aVKJOF1sPUtKQxdL .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-aVKJOF1sPUtKQxdL .cluster text{fill:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL .cluster span{color:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-aVKJOF1sPUtKQxdL .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-aVKJOF1sPUtKQxdL rect.text{fill:none;stroke-width:0;}#mermaid-svg-aVKJOF1sPUtKQxdL .icon-shape,#mermaid-svg-aVKJOF1sPUtKQxdL .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-aVKJOF1sPUtKQxdL .icon-shape p,#mermaid-svg-aVKJOF1sPUtKQxdL .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-aVKJOF1sPUtKQxdL .icon-shape rect,#mermaid-svg-aVKJOF1sPUtKQxdL .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-aVKJOF1sPUtKQxdL 
.label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-aVKJOF1sPUtKQxdL .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-aVKJOF1sPUtKQxdL :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>RNN \u7684\u63a5\u529b\u8dd1\u6a21\u5f0f (\u4e32\u884c)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u8def\u9014\u9065\u8fdc&#xff0c;\u4fe1\u606f\u8870\u51cf<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Input: The<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>h1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>h2<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Input: cat<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>h3<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Input: &#8230;<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>h99<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>h100<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Input: it<\/p>\n<p><\/span><\/p>\n<p>\u56fe\u89e3\u8bf4\u660e&#xff1a;\u6ce8\u610f\u90a3\u4e2a\u4ece <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          h<\/p>\n<p>          1<\/p>\n<p>        h_1<\/p>\n<p>     <\/span><span 
class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">h<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u5230 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          h<\/p>\n<p>          100<\/p>\n<p>        h_{100}<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">h<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">100<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u7684\u865a\u7ebf\u3002\u5728 RNN \u4e2d&#xff0c;it \u60f3\u8981\u7406\u89e3\u81ea\u5df1\u662f\u6307\u4ee3 
cat&#xff0c;\u9700\u8981\u8de8\u8d8a\u6f2b\u957f\u7684\u65f6\u95f4\u6b65&#xff0c;\u4fe1\u53f7\u65e9\u5df2\u5fae\u5f31\u4e0d\u582a\u3002<\/p>\n<hr \/>\n<h4>2. Transformer \u7684\u201c\u4e0a\u5e1d\u89c6\u89d2\u201d<\/h4>\n<p>Transformer \u7684\u51fa\u73b0&#xff0c;\u5c31\u50cf\u662f\u628a\u8fd9\u4e2a\u9010\u5b57\u9605\u8bfb\u7684 AI&#xff0c;\u53d8\u6210\u4e86\u4e00\u4e2a\u8fc7\u76ee\u4e0d\u5fd8\u7684\u901f\u8bfb\u5929\u624d\u3002<\/p>\n<p>\u5b83\u629b\u5f03\u4e86\u5faa\u73af&#xff08;Recurrence&#xff09;&#xff0c;\u5b83\u4e0d\u518d\u4e00\u4e2a\u8bcd\u4e00\u4e2a\u8bcd\u5730\u8bfb&#xff0c;\u800c\u662f\u4e00\u5df4\u638c\u628a\u6574\u53e5\u8bdd\u62cd\u5728\u684c\u5b50\u4e0a&#xff0c;\u540c\u65f6\u5904\u7406\u6240\u6709\u7684\u8bcd&#xff01;<\/p>\n<ul>\n<li>\u6ca1\u6709\u65f6\u95f4\u5148\u540e&#xff1a;\u5b83\u4e0d\u9700\u8981\u7b49 The \u8bfb\u5b8c\u624d\u8bfb cat\u3002<\/li>\n<li>\u5e76\u884c\u8ba1\u7b97&#xff1a;\u663e\u5361\u91cc\u7684\u51e0\u5343\u4e2a\u6838\u5fc3\u7ec8\u4e8e\u53ef\u4ee5\u540c\u65f6\u5f00\u5de5\u4e86&#xff0c;\u901f\u5ea6\u8d77\u98de\u3002<\/li>\n<li>\u65e0\u89c6\u8ddd\u79bb&#xff1a;\u5728 Transformer \u773c\u91cc&#xff0c;\u7b2c 1 \u4e2a\u8bcd\u548c\u7b2c 100 \u4e2a\u8bcd\u4e4b\u95f4\u7684\u8ddd\u79bb\u662f 0\u3002\u5b83\u4eec\u4e4b\u95f4\u53ef\u4ee5\u76f4\u63a5\u201c\u7709\u6765\u773c\u53bb\u201d&#xff08;\u8ba1\u7b97\u6ce8\u610f\u529b&#xff09;&#xff0c;\u4e0d\u9700\u8981\u4e2d\u95f4\u5546\u4f20\u8bdd\u3002<\/li>\n<\/ul>\n<p>\u8fd9\u5c31\u597d\u6bd4\u5927\u5bb6\u5728\u4e00\u4e2a\u5435\u95f9\u7684\u9e21\u5c3e\u9152\u4f1a\u4e0a&#xff1a;<\/p>\n<ul>\n<li>RNN \u662f\u4e00\u4e2a\u4eba\u62ff\u7740\u9ea6\u514b\u98ce\u6328\u4e2a\u91c7\u8bbf&#xff0c;\u95ee\u5b8c\u8fd9\u4e2a\u624d\u80fd\u95ee\u4e0b\u4e00\u4e2a\u3002<\/li>\n<li>Transformer 
\u662f\u6240\u6709\u4eba\u540c\u65f6\u5728\u623f\u95f4\u91cc&#xff0c;\u8c01\u60f3\u548c\u8c01\u8bf4\u8bdd\u90fd\u53ef\u4ee5\u76f4\u63a5\u558a&#xff0c;\u5927\u5bb6\u90fd\u80fd\u77ac\u95f4\u542c\u5230\u5f7c\u6b64\u3002<\/li>\n<\/ul>\n<p>\u6211\u4eec\u6765\u770b Transformer \u662f\u5982\u4f55\u5de5\u4f5c\u7684&#xff1a;<\/p>\n<p>  #mermaid-svg-rkejNaFNd8LBkDye{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-rkejNaFNd8LBkDye .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-rkejNaFNd8LBkDye .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-rkejNaFNd8LBkDye .error-icon{fill:#552222;}#mermaid-svg-rkejNaFNd8LBkDye .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-rkejNaFNd8LBkDye .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-rkejNaFNd8LBkDye .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-rkejNaFNd8LBkDye .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-rkejNaFNd8LBkDye .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-rkejNaFNd8LBkDye .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-rkejNaFNd8LBkDye .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-rkejNaFNd8LBkDye .marker{fill:#333333;stroke:#333333;}#mermaid-svg-rkejNaFNd8LBkDye .marker.cross{stroke:#333333;}#mermaid-svg-rkejNaFNd8LBkDye svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-rkejNaFNd8LBkDye p{margin:0;}#mermaid-svg-rkejNaFNd8LBkDye .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-rkejNaFNd8LBkDye .cluster-label text{fill:#333;}#mermaid-svg-rkejNaFNd8LBkDye .cluster-label span{color:#333;}#mermaid-svg-rkejNaFNd8LBkDye 
.cluster-label span p{background-color:transparent;}#mermaid-svg-rkejNaFNd8LBkDye .label text,#mermaid-svg-rkejNaFNd8LBkDye span{fill:#333;color:#333;}#mermaid-svg-rkejNaFNd8LBkDye .node rect,#mermaid-svg-rkejNaFNd8LBkDye .node circle,#mermaid-svg-rkejNaFNd8LBkDye .node ellipse,#mermaid-svg-rkejNaFNd8LBkDye .node polygon,#mermaid-svg-rkejNaFNd8LBkDye .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-rkejNaFNd8LBkDye .rough-node .label text,#mermaid-svg-rkejNaFNd8LBkDye .node .label text,#mermaid-svg-rkejNaFNd8LBkDye .image-shape .label,#mermaid-svg-rkejNaFNd8LBkDye .icon-shape .label{text-anchor:middle;}#mermaid-svg-rkejNaFNd8LBkDye .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-rkejNaFNd8LBkDye .rough-node .label,#mermaid-svg-rkejNaFNd8LBkDye .node .label,#mermaid-svg-rkejNaFNd8LBkDye .image-shape .label,#mermaid-svg-rkejNaFNd8LBkDye .icon-shape .label{text-align:center;}#mermaid-svg-rkejNaFNd8LBkDye .node.clickable{cursor:pointer;}#mermaid-svg-rkejNaFNd8LBkDye .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-rkejNaFNd8LBkDye .arrowheadPath{fill:#333333;}#mermaid-svg-rkejNaFNd8LBkDye .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-rkejNaFNd8LBkDye .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-rkejNaFNd8LBkDye .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-rkejNaFNd8LBkDye .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-rkejNaFNd8LBkDye .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-rkejNaFNd8LBkDye .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-rkejNaFNd8LBkDye .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-rkejNaFNd8LBkDye .cluster text{fill:#333;}#mermaid-svg-rkejNaFNd8LBkDye .cluster span{color:#333;}#mermaid-svg-rkejNaFNd8LBkDye 
div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-rkejNaFNd8LBkDye .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-rkejNaFNd8LBkDye rect.text{fill:none;stroke-width:0;}#mermaid-svg-rkejNaFNd8LBkDye .icon-shape,#mermaid-svg-rkejNaFNd8LBkDye .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-rkejNaFNd8LBkDye .icon-shape p,#mermaid-svg-rkejNaFNd8LBkDye .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-rkejNaFNd8LBkDye .icon-shape rect,#mermaid-svg-rkejNaFNd8LBkDye .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-rkejNaFNd8LBkDye .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-rkejNaFNd8LBkDye .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-rkejNaFNd8LBkDye :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Transformer \u7684\u4e0a\u5e1d\u89c6\u89d2<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u6240\u6709\u8bcd\u540c\u65f6\u8f93\u5165<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u76f4\u63a5\u8fde\u63a5<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u76f4\u63a5\u8fde\u63a5<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Input: The cat &#8230; it<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Transformer \u6838\u5fc3\u5904\u7406\u533a<\/p>\n<p><\/span><\/p>\n<p>         <span 
class=\"nodeLabel\"><\/p>\n<p>The<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>it<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>cat<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Output: \u7ffb\u8bd1\u7ed3\u679c<\/p>\n<p><\/span><\/p>\n<p>\u56fe\u89e3\u8bf4\u660e&#xff1a;\u5728 Transformer \u4e2d&#xff0c;cat \u548c it \u4e4b\u95f4\u6709\u4e00\u6761\u76f4\u63a5\u7684\u8fde\u7ebf&#xff08;\u901a\u8fc7 Self-Attention \u673a\u5236&#xff0c;\u540e\u9762\u7ae0\u8282\u4f1a\u7ec6\u8bb2&#xff09;\u3002\u65e0\u8bba\u53e5\u5b50\u591a\u957f&#xff0c;\u5b83\u4eec\u6c38\u8fdc\u662f\u201c\u90bb\u5c45\u201d\u3002<\/p>\n<h4>3. \u672c\u7ae0\u5c0f\u7ed3&#xff1a;\u65f6\u4ee3\u7684\u8f6c\u6298\u70b9<\/h4>\n<p>\u5982\u679c\u628a NLP \u6a21\u578b\u7684\u8fdb\u5316\u53f2\u770b\u4f5c\u4ea4\u901a\u5de5\u5177\u7684\u6f14\u53d8&#xff1a;<\/p>\n<ul>\n<li>RNN \u662f\u4e00\u5217\u8001\u5f0f\u706b\u8f66&#xff0c;\u8f66\u53a2\u5fc5\u987b\u4e00\u8282\u6263\u4e00\u8282&#xff0c;\u7b2c\u4e00\u8282\u52a8\u4e86&#xff0c;\u6700\u540e\u4e00\u8282\u624d\u80fd\u8ddf\u7740\u52a8\u3002<\/li>\n<li>Transformer \u662f\u4e00\u652f\u661f\u9645\u8230\u961f&#xff0c;\u6240\u6709\u98de\u8239&#xff08;\u5355\u8bcd&#xff09;\u5728\u592a\u7a7a\u4e2d\u5c55\u5f00&#xff0c;\u5f7c\u6b64\u4e4b\u95f4\u901a\u8fc7\u65e0\u7ebf\u7535\u77ac\u65f6\u8054\u7edc&#xff0c;\u65e2\u80fd\u72ec\u7acb\u673a\u52a8&#xff0c;\u53c8\u80fd\u6574\u4f53\u534f\u540c\u3002<\/li>\n<\/ul>\n<p>\u8fd9\u4e00\u7ae0\u4f60\u9700\u8981\u8bb0\u4f4f\u7684\u6838\u5fc3\u6982\u5ff5&#xff1a;<\/p>\n<ol>\n<li>\u629b\u5f03 RNN \u662f\u4e3a\u4e86\u89e3\u51b3\u65e0\u6cd5\u5e76\u884c\u8bad\u7ec3&#xff08;\u6162&#xff09;\u548c\u957f\u8ddd\u79bb\u9057\u5fd8&#xff08;\u7b28&#xff09;\u7684\u95ee\u9898\u3002<\/li>\n<li>Transformer \u7684\u5fc5\u6740\u6280\u662f\u5e76\u884c\u5316\u548c\u5168\u5c40\u89c6\u91ce\u3002<\/li>\n<\/ol>\n<hr 
\/>\n<p>\u8fd9\u5c31\u5f15\u51fa\u4e86\u4e00\u4e2a\u5de8\u5927\u7684\u60ac\u5ff5&#xff1a;\u65e2\u7136\u6ca1\u6709\u4e86\u5148\u540e\u987a\u5e8f&#xff0c;Transformer \u600e\u4e48\u77e5\u9053\u201c\u6211\u7231\u4f60\u201d\u548c\u201c\u4f60\u7231\u6211\u201d\u7684\u533a\u522b\u5462&#xff1f;\u6bd5\u7adf\u8fd9\u4e24\u4e2a\u53e5\u5b50\u7684\u5355\u8bcd\u5b8c\u5168\u4e00\u6837\u554a&#xff01;\u8fd9\u5c31\u662f\u6211\u4eec\u7b2c\u4e8c\u7ae0\u548c\u7b2c\u4e09\u7ae0\u8981\u89e3\u51b3\u7684\u95ee\u9898\u3002<\/p>\n<hr \/>\n<p>\u5728\u7b2c\u4e00\u7ae0&#xff0c;\u6211\u4eec\u5df2\u7ecf\u628a RNN \u6254\u8fdb\u4e86\u5386\u53f2\u7684\u5783\u573e\u6876\u3002\u73b0\u5728&#xff0c;\u6211\u4eec\u8981\u628a Transformer \u642c\u4e0a\u89e3\u5256\u53f0\u3002<\/p>\n<p>\u5728\u6df1\u5165\u90a3\u4e9b\u590d\u6742\u7684\u77e9\u9635\u4e58\u6cd5\u4e4b\u524d&#xff0c;\u6211\u4eec\u5148\u5f97\u9000\u540e\u4e00\u6b65&#xff0c;\u7528\u5b8f\u89c2\u7684\u89c6\u89d2\u770b\u770b\u5b83\u7684\u6574\u4f53\u9aa8\u67b6\u3002\u5982\u679c\u4f60\u4e00\u4e0a\u6765\u5c31\u94bb\u8fdb\u201c\u81ea\u6ce8\u610f\u529b\u201d\u7684\u7ec6\u8282&#xff0c;\u5f88\u5bb9\u6613\u53ea\u89c1\u6811\u6728\u4e0d\u89c1\u68ee\u6797\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u4e8c\u7ae0&#xff1a;\u9ed1\u76d2\u4e0e\u4e50\u9ad8\u2014\u2014\u5b8f\u89c2\u4fef\u77b0 Transformer<\/h2>\n<p>\u5f88\u591a\u6280\u672f\u535a\u5ba2\u4e0a\u6765\u5c31\u7ed9\u4f60\u6254\u51fa\u4e00\u5f20\u5bc6\u5bc6\u9ebb\u9ebb\u7684\u5185\u90e8\u7ed3\u6784\u56fe&#xff0c;\u770b\u7740\u5c31\u5934\u6655\u3002\u6211\u4eec\u4e0d\u8fd9\u4e48\u5e72\u3002\u6211\u4eec\u5148\u5047\u8bbe Transformer \u5c31\u662f\u4e00\u4e2a\u5b8c\u5168\u5bc6\u5c01\u7684\u9ed1\u76d2\u5b50\u3002<\/p>\n<h4>1. 
\u6700\u7b80\u5355\u7684\u89c6\u89d2&#xff1a;\u795e\u5947\u7684\u7ffb\u8bd1\u673a<\/h4>\n<p>\u60f3\u8c61 Transformer \u662f\u4e00\u4e2a\u653e\u5728\u684c\u5b50\u4e0a\u7684\u9ed1\u8272\u65b9\u5757\u3002<\/p>\n<ul>\n<li>\u5de6\u8fb9\u5165\u53e3&#xff1a;\u585e\u8fdb\u53bb\u4e00\u53e5\u4e2d\u6587&#xff1a;\u201c\u6211\u7231\u5b66\u4e60\u201d\u3002<\/li>\n<li>\u53f3\u8fb9\u51fa\u53e3&#xff1a;\u5410\u51fa\u6765\u4e00\u53e5\u82f1\u6587&#xff1a;\u201cI love study\u201d\u3002<\/li>\n<\/ul>\n<p>\u6b64\u65f6&#xff0c;\u6211\u4eec\u4e0d\u9700\u8981\u77e5\u9053\u91cc\u9762\u53d1\u751f\u4e86\u4ec0\u4e48&#xff0c;\u53ea\u9700\u8981\u77e5\u9053\u5b83\u662f\u4e00\u4e2aSequence-to-Sequence(\u5e8f\u5217\u5230\u5e8f\u5217)\u7684\u6a21\u578b\u3002<\/p>\n<p>  #mermaid-svg-iQG7zzNYbwMPjLE9{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-iQG7zzNYbwMPjLE9 .error-icon{fill:#552222;}#mermaid-svg-iQG7zzNYbwMPjLE9 .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-iQG7zzNYbwMPjLE9 .marker{fill:#333333;stroke:#333333;}#mermaid-svg-iQG7zzNYbwMPjLE9 
.marker.cross{stroke:#333333;}#mermaid-svg-iQG7zzNYbwMPjLE9 svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-iQG7zzNYbwMPjLE9 p{margin:0;}#mermaid-svg-iQG7zzNYbwMPjLE9 .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .cluster-label text{fill:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .cluster-label span{color:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .cluster-label span p{background-color:transparent;}#mermaid-svg-iQG7zzNYbwMPjLE9 .label text,#mermaid-svg-iQG7zzNYbwMPjLE9 span{fill:#333;color:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .node rect,#mermaid-svg-iQG7zzNYbwMPjLE9 .node circle,#mermaid-svg-iQG7zzNYbwMPjLE9 .node ellipse,#mermaid-svg-iQG7zzNYbwMPjLE9 .node polygon,#mermaid-svg-iQG7zzNYbwMPjLE9 .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .rough-node .label text,#mermaid-svg-iQG7zzNYbwMPjLE9 .node .label text,#mermaid-svg-iQG7zzNYbwMPjLE9 .image-shape .label,#mermaid-svg-iQG7zzNYbwMPjLE9 .icon-shape .label{text-anchor:middle;}#mermaid-svg-iQG7zzNYbwMPjLE9 .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .rough-node .label,#mermaid-svg-iQG7zzNYbwMPjLE9 .node .label,#mermaid-svg-iQG7zzNYbwMPjLE9 .image-shape .label,#mermaid-svg-iQG7zzNYbwMPjLE9 .icon-shape .label{text-align:center;}#mermaid-svg-iQG7zzNYbwMPjLE9 .node.clickable{cursor:pointer;}#mermaid-svg-iQG7zzNYbwMPjLE9 .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .arrowheadPath{fill:#333333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-iQG7zzNYbwMPjLE9 .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-iQG7zzNYbwMPjLE9 .edgeLabel 
rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-iQG7zzNYbwMPjLE9 .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-iQG7zzNYbwMPjLE9 .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .cluster text{fill:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 .cluster span{color:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-iQG7zzNYbwMPjLE9 .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-iQG7zzNYbwMPjLE9 rect.text{fill:none;stroke-width:0;}#mermaid-svg-iQG7zzNYbwMPjLE9 .icon-shape,#mermaid-svg-iQG7zzNYbwMPjLE9 .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-iQG7zzNYbwMPjLE9 .icon-shape p,#mermaid-svg-iQG7zzNYbwMPjLE9 .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-iQG7zzNYbwMPjLE9 .icon-shape rect,#mermaid-svg-iQG7zzNYbwMPjLE9 .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-iQG7zzNYbwMPjLE9 .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-iQG7zzNYbwMPjLE9 .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-iQG7zzNYbwMPjLE9 :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165: \u6211\u7231\u5b66\u4e60<\/p>\n<p><\/span><\/p>\n<p>         <span style=\"color:#fff !important\" class=\"nodeLabel\"><\/p>\n<p>Transformer 
\u9ed1\u76d2<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa: I love study<\/p>\n<p><\/span><\/p>\n<h4>2. \u6380\u5f00\u76d6\u5b50&#xff1a;\u5de6\u53f3\u4e92\u640f\u7684\u201c\u4e50\u9ad8\u5854\u201d<\/h4>\n<p>\u73b0\u5728&#xff0c;\u6211\u4eec\u628a\u9ed1\u76d2\u5b50\u7684\u76d6\u5b50\u6380\u5f00\u3002\u4f60\u4f1a\u53d1\u73b0\u91cc\u9762\u5e76\u4e0d\u662f\u6d51\u7136\u4e00\u4f53\u7684&#xff0c;\u800c\u662f\u5206\u6210\u4e86\u5de6\u53f3\u4e24\u5927\u9635\u8425&#xff0c;\u5c31\u50cf\u4e24\u5ea7\u4e50\u9ad8\u79ef\u6728\u5854\u3002<\/p>\n<p>\u8fd9\u5c31\u662f\u5927\u540d\u9f0e\u9f0e\u7684 Encoder-Decoder&#xff08;\u7f16\u7801\u5668-\u89e3\u7801\u5668&#xff09; \u67b6\u6784\u3002<\/p>\n<ul>\n<li>\u5de6\u8fb9\u7684\u5854&#xff08;Encoder \u7f16\u7801\u5668&#xff09;&#xff1a;\u8d1f\u8d23\u201c\u8f93\u5165\u201d\u3002\u5b83\u7684\u5de5\u4f5c\u662f\u201c\u7406\u89e3\u201d\u3002\u5b83\u8981\u628a\u4f60\u90a3\u53e5\u4e2d\u6587\u56bc\u788e\u4e86\u3001\u5403\u900f\u4e86&#xff0c;\u8f6c\u5316\u6210\u4e00\u56e2\u673a\u5668\u80fd\u770b\u61c2\u7684\u201c\u8bed\u4e49\u5411\u91cf\u201d\u3002<\/li>\n<li>\u53f3\u8fb9\u7684\u5854&#xff08;Decoder \u89e3\u7801\u5668&#xff09;&#xff1a;\u8d1f\u8d23\u201c\u8f93\u51fa\u201d\u3002\u5b83\u7684\u5de5\u4f5c\u662f\u201c\u751f\u6210\u201d\u3002\u5b83\u62ff\u7740\u5de6\u8fb9\u7ed9\u5b83\u7684\u201c\u8bed\u4e49\u201d&#xff0c;\u6839\u636e\u4e0a\u6587&#xff0c;\u4e00\u4e2a\u5b57\u4e00\u4e2a\u5b57\u5730\u8e66\u51fa\u82f1\u6587\u3002<\/li>\n<\/ul>\n<h5>&#x1f393; 
\u751f\u52a8\u6bd4\u55bb&#xff1a;\u8001\u5b66\u7a76\u4e0e\u7545\u9500\u4e66\u4f5c\u5bb6<\/h5>\n<p>\u4e3a\u4e86\u5f7b\u5e95\u7406\u89e3\u8fd9\u4e24\u5144\u5f1f\u7684\u5173\u7cfb&#xff0c;\u6211\u4eec\u53ef\u4ee5\u8fd9\u6837\u6bd4\u55bb&#xff1a;<\/p>\n<ul>\n<li>Encoder&#xff08;\u8001\u5b66\u7a76&#xff09;&#xff1a;\u4ed6\u535a\u53e4\u901a\u4eca&#xff0c;\u9605\u8bfb\u7406\u89e3\u80fd\u529b\u6ee1\u5206\u3002\u4f60\u7ed9\u4ed6\u4e00\u7bc7\u6587\u7ae0&#xff0c;\u4ed6\u770b\u5b8c\u540e\u4e0d\u8bf4\u8bdd&#xff0c;\u800c\u662f\u9ed8\u9ed8\u5730\u753b\u51fa\u4e00\u5f20\u201c\u601d\u7ef4\u5bfc\u56fe\u201d&#xff08;Context Vector&#xff09;\u3002\u8fd9\u5f20\u56fe\u91cc\u5305\u542b\u4e86\u6587\u7ae0\u91cc\u6240\u6709\u7684\u77e5\u8bc6\u70b9\u3001\u903b\u8f91\u5173\u7cfb\u548c\u60c5\u611f\u8272\u5f69\u3002<\/li>\n<li>Decoder&#xff08;\u7545\u9500\u4e66\u4f5c\u5bb6&#xff09;&#xff1a;\u4ed6\u6587\u7b14\u6781\u597d&#xff0c;\u8d1f\u8d23\u6839\u636e\u90a3\u5f20\u201c\u601d\u7ef4\u5bfc\u56fe\u201d\u6765\u5199\u6587\u7ae0\u3002\u4f46\u4ed6\u6709\u4e2a\u4e60\u60ef&#xff0c;\u5199\u5b57\u5fc5\u987b\u4e00\u4e2a\u4e00\u4e2a\u5199\u3002\u4ed6\u4e00\u8fb9\u770b\u7740\u8001\u5b66\u7a76\u7ed9\u7684\u5bfc\u56fe&#xff08;Cross Attention&#xff09;&#xff0c;\u4e00\u8fb9\u770b\u7740\u81ea\u5df1\u521a\u521a\u5199\u4e0b\u7684\u5b57&#xff08;Masked Self-Attention&#xff09;&#xff0c;\u4ee5\u6b64\u6765\u51b3\u5b9a\u4e0b\u4e00\u4e2a\u5b57\u5199\u4ec0\u4e48\u3002<\/li>\n<\/ul>\n<p>\u6ce8\u610f&#xff1a;\u8fd9\u4e24\u5ea7\u5854\u4e0d\u4ec5\u662f\u5206\u5f00\u7684&#xff0c;\u800c\u4e14\u662f\u53ef\u4ee5\u5806\u53e0\u7684\u3002 \u5728 Google \u6700\u521d\u7684\u8bba\u6587\u4e2d&#xff0c;Encoder \u548c Decoder \u5404\u81ea\u5806\u4e86 6 \u5c42\u3002\u4e3a\u4ec0\u4e48\u662f 6 \u5c42&#xff1f;\u6ca1\u5565\u7279\u522b\u539f\u56e0&#xff0c;\u53ef\u80fd\u662f Google \u5de5\u7a0b\u5e08\u89c9\u5f97 6 
\u8fd9\u4e2a\u6570\u5b57\u5409\u5229&#xff08;\u5176\u5b9e\u662f\u5b9e\u9a8c\u51fa\u6765\u7684\u8d85\u53c2\u6570&#xff09;\u3002\u8fd9\u5c31\u597d\u6bd4\u8001\u5b66\u7a76\u4e0d\u662f\u4e00\u4e2a\u4eba&#xff0c;\u800c\u662f 6 \u4e2a\u8001\u5b66\u7a76\u7ec4\u6210\u7684\u4e13\u5bb6\u7ec4&#xff0c;\u4e00\u5c42\u5c42\u5730\u4f20\u9012\u4fe1\u606f&#xff0c;\u7406\u89e3\u5f97\u8d8a\u6765\u8d8a\u6df1\u3002<\/p>\n<h4>3. \u5b8f\u89c2\u67b6\u6784\u56fe\u89e3<\/h4>\n<p>\u6211\u4eec\u7528 Mermaid \u6765\u753b\u4e00\u4e0b\u8fd9\u4e2a\u201c\u4e50\u9ad8\u5854\u201d\u7ed3\u6784\u3002\u8bf7\u6ce8\u610f\u4e2d\u95f4\u90a3\u6761\u6a2a\u8de8\u5de6\u53f3\u7684\u8fde\u7ebf&#xff0c;\u90a3\u5c31\u662f<strong>\u201c\u8001\u5b66\u7a76\u201d\u628a\u201c\u601d\u7ef4\u5bfc\u56fe\u201d\u9012\u7ed9\u201c\u4f5c\u5bb6\u201d\u7684\u5173\u952e\u65f6\u523b<\/strong>\u3002<\/p>\n<p>  #mermaid-svg-vbvADCBfv6pIHHzF{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-vbvADCBfv6pIHHzF .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-vbvADCBfv6pIHHzF .error-icon{fill:#552222;}#mermaid-svg-vbvADCBfv6pIHHzF .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-vbvADCBfv6pIHHzF .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-vbvADCBfv6pIHHzF 
.marker{fill:#333333;stroke:#333333;}#mermaid-svg-vbvADCBfv6pIHHzF .marker.cross{stroke:#333333;}#mermaid-svg-vbvADCBfv6pIHHzF svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-vbvADCBfv6pIHHzF p{margin:0;}#mermaid-svg-vbvADCBfv6pIHHzF .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-vbvADCBfv6pIHHzF .cluster-label text{fill:#333;}#mermaid-svg-vbvADCBfv6pIHHzF .cluster-label span{color:#333;}#mermaid-svg-vbvADCBfv6pIHHzF .cluster-label span p{background-color:transparent;}#mermaid-svg-vbvADCBfv6pIHHzF .label text,#mermaid-svg-vbvADCBfv6pIHHzF span{fill:#333;color:#333;}#mermaid-svg-vbvADCBfv6pIHHzF .node rect,#mermaid-svg-vbvADCBfv6pIHHzF .node circle,#mermaid-svg-vbvADCBfv6pIHHzF .node ellipse,#mermaid-svg-vbvADCBfv6pIHHzF .node polygon,#mermaid-svg-vbvADCBfv6pIHHzF .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-vbvADCBfv6pIHHzF .rough-node .label text,#mermaid-svg-vbvADCBfv6pIHHzF .node .label text,#mermaid-svg-vbvADCBfv6pIHHzF .image-shape .label,#mermaid-svg-vbvADCBfv6pIHHzF .icon-shape .label{text-anchor:middle;}#mermaid-svg-vbvADCBfv6pIHHzF .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-vbvADCBfv6pIHHzF .rough-node .label,#mermaid-svg-vbvADCBfv6pIHHzF .node .label,#mermaid-svg-vbvADCBfv6pIHHzF .image-shape .label,#mermaid-svg-vbvADCBfv6pIHHzF .icon-shape .label{text-align:center;}#mermaid-svg-vbvADCBfv6pIHHzF .node.clickable{cursor:pointer;}#mermaid-svg-vbvADCBfv6pIHHzF .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-vbvADCBfv6pIHHzF .arrowheadPath{fill:#333333;}#mermaid-svg-vbvADCBfv6pIHHzF .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-vbvADCBfv6pIHHzF .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-vbvADCBfv6pIHHzF .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-vbvADCBfv6pIHHzF .edgeLabel 
p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-vbvADCBfv6pIHHzF .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-vbvADCBfv6pIHHzF .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-vbvADCBfv6pIHHzF .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-vbvADCBfv6pIHHzF .cluster text{fill:#333;}#mermaid-svg-vbvADCBfv6pIHHzF .cluster span{color:#333;}#mermaid-svg-vbvADCBfv6pIHHzF div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-vbvADCBfv6pIHHzF .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-vbvADCBfv6pIHHzF rect.text{fill:none;stroke-width:0;}#mermaid-svg-vbvADCBfv6pIHHzF .icon-shape,#mermaid-svg-vbvADCBfv6pIHHzF .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-vbvADCBfv6pIHHzF .icon-shape p,#mermaid-svg-vbvADCBfv6pIHHzF .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-vbvADCBfv6pIHHzF .icon-shape rect,#mermaid-svg-vbvADCBfv6pIHHzF .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-vbvADCBfv6pIHHzF .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-vbvADCBfv6pIHHzF .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-vbvADCBfv6pIHHzF :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Decoder Stack (\u89e3\u7801\u5668\u5806\u53e0)<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Encoder Stack (\u7f16\u7801\u5668\u5806\u53e0)<\/p>\n<p><\/span><\/p>\n<p>         <span 
class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u4f20\u9012\u8bed\u4e49\u8bb0\u5fc6 (K, V)<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u4f20\u9012\u8bed\u4e49\u8bb0\u5fc6 (K, V)<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u4f20\u9012\u8bed\u4e49\u8bb0\u5fc6 (K, V)<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/p>\n<p>\u4f20\u9012\u8bed\u4e49\u8bb0\u5fc6 (K, V)<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"edgeLabel\"><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165: \u6211 \u7231 \u5b66 \u4e60<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Encoder Layer 1<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Encoder Layer 2<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>&#8230;<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Encoder Layer 6  (\u8001\u5b66\u7a76\u7684\u6700\u7ec8\u7406\u89e3)<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165: Start I love &#8230;<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Decoder Layer 1<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Decoder Layer 2<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>&#8230;<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Decoder Layer 6<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>Linear &#043; 
Softmax<\/p>\n<p><\/span><\/p>\n<p>         <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa\u6982\u7387: study<\/p>\n<p><\/span><\/p>\n<h4>4. \u8fd9\u91cc\u7684\u201c\u5751\u201d\u548c\u201c\u5f69\u86cb\u201d<\/h4>\n<p>\u5728\u7ee7\u7eed\u6df1\u5165\u4e4b\u524d&#xff0c;\u6709\u4e24\u4e2a\u6982\u5ff5\u4f60\u9700\u8981\u5148\u6709\u4e2a\u5370\u8c61&#xff0c;\u9632\u6b62\u540e\u9762\u8ff7\u8def&#xff1a;<\/p>\n<ol>\n<li>\n<p>\u5e76\u884c\u4e0e\u4e32\u884c\u7684\u5206\u88c2&#xff1a;<\/p>\n<ul>\n<li>Encoder \u662f\u5b8c\u5168\u5e76\u884c\u7684\u3002\u5b83\u662f\u4e00\u773c\u770b\u5b8c\u6574\u4e2a\u53e5\u5b50&#xff08;\u6211\u7231\u5b66\u4e60&#xff09;&#xff0c;\u8fd9 4 \u4e2a\u5b57\u540c\u65f6\u8fdb\u5165\u6a21\u578b\u5904\u7406\u3002<\/li>\n<li>Decoder \u5728\u63a8\u7406&#xff08;\u7ffb\u8bd1&#xff09;\u65f6&#xff0c;\u4f9d\u7136\u662f\u4e32\u884c\u7684\u3002\u5b83\u53ea\u80fd\u5148\u751f\u6210 I&#xff0c;\u518d\u751f\u6210 love&#xff0c;\u518d\u751f\u6210 study\u3002\u56e0\u4e3a\u5b83\u4e0d\u77e5\u9053\u540e\u9762\u4f1a\u5199\u5565\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>\u5bb6\u65cf\u5206\u5bb6&#xff08;\u9884\u544a&#xff09;&#xff1a; \u73b0\u5728\u7684 AI \u754c&#xff0c;\u539f\u59cb\u7684 Encoder-Decoder \u7ed3\u6784\u5176\u5b9e\u7528\u5f97\u53d8\u5c11\u4e86&#xff0c;\u5927\u5bb6\u5f00\u59cb\u5206\u5bb6\u4ea7&#xff1a;<\/p>\n<ul>\n<li>BERT \u5bb6\u65cf&#xff1a;\u53ea\u62ff\u8d70\u4e86\u5de6\u8fb9\u7684 Encoder\u3002\u56e0\u4e3a\u5b83\u4e0d\u9700\u8981\u8bf4\u8bdd&#xff0c;\u53ea\u9700\u8981\u201c\u7406\u89e3\u201d\u8bed\u8a00&#xff08;\u505a\u586b\u7a7a\u9898\u3001\u9605\u8bfb\u7406\u89e3&#xff09;\u3002<\/li>\n<li>GPT \u5bb6\u65cf&#xff1a;\u53ea\u62ff\u8d70\u4e86\u53f3\u8fb9\u7684 Decoder\u3002\u56e0\u4e3a\u5b83\u4e0d\u9700\u8981\u7406\u89e3\u7279\u5b9a\u7684\u8f93\u5165\u6587\u672c&#xff0c;\u5b83\u53ea\u7ba1\u5728\u90a3\u513f\u201c\u778e\u7f16\u201d\u4e0b\u4e00\u53e5&#xff0c;\u4e00\u8def\u751f\u6210\u4e0b\u53bb\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n<hr 
\/>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<p>\u8fd9\u4e00\u7ae0\u6211\u4eec\u4e0d\u9700\u8981\u641e\u61c2\u5185\u90e8\u8ba1\u7b97&#xff0c;\u53ea\u9700\u8981\u8bb0\u4f4f&#xff1a;<\/p>\n<ol>\n<li>Transformer &#061; Encoder&#xff08;\u7406\u89e3\u8005&#xff09; &#043; Decoder&#xff08;\u751f\u6210\u8005&#xff09;\u3002<\/li>\n<li>\u5b83\u4eec\u662f\u7531\u591a\u5c42\u7f51\u7edc\u5806\u53e0\u800c\u6210\u7684&#xff08;\u50cf\u4e50\u9ad8\u79ef\u6728&#xff09;\u3002<\/li>\n<li>\u4fe1\u606f\u6d41\u662f\u4ece Encoder \u6d41\u5411 Decoder\u3002<\/li>\n<\/ol>\n<p>\u73b0\u5728&#xff0c;\u6211\u4eec\u8981\u628a\u955c\u5934\u63a8\u8fdb&#xff0c;\u8d70\u8fdb Encoder \u7684\u7b2c\u4e00\u5c42\u3002\u4f46\u5728\u8ba1\u7b97\u4e4b\u524d&#xff0c;\u673a\u5668\u9762\u4e34\u4e00\u4e2a\u5de8\u5927\u7684\u95ee\u9898&#xff1a;\u673a\u5668\u4e0d\u8ba4\u8bc6\u6c49\u5b57&#xff0c;\u4e5f\u4e0d\u8ba4\u8bc6\u82f1\u6587&#xff0c;\u5b83\u53ea\u8ba4\u8bc6\u6570\u5b57\u3002<\/p>\n<p>\u800c\u4e14&#xff0c;\u6211\u4eec\u5728\u7b2c\u4e00\u7ae0\u8bf4\u8fc7&#xff0c;Transformer \u5e76\u884c\u5904\u7406\u5bfc\u81f4\u5b83\u6ca1\u6709\u201c\u65f6\u95f4\u6982\u5ff5\u201d\u3002\u5982\u679c\u4e0d\u505a\u7279\u6b8a\u5904\u7406&#xff0c;\u673a\u5668\u4f1a\u8ba4\u4e3a\u201c\u8bfb\u4e66\u201d\u548c\u201c\u4e66\u8bfb\u201d\u662f\u4e00\u56de\u4e8b\u3002<\/p>\n<hr \/>\n<p>\u5728\u524d\u4e24\u7ae0\u6211\u4eec\u5439\u4e86\u534a\u5929 Transformer 
\u7684\u201c\u5e76\u884c\u8ba1\u7b97\u201d\u6709\u591a\u725b&#xff0c;\u4f46\u8fd9\u5176\u5b9e\u5e26\u6765\u4e86\u4e00\u4e2a\u5de8\u5927\u7684\u526f\u4f5c\u7528&#xff1a;\u5b83\u662f\u4e2a\u4e0d\u6298\u4e0d\u6263\u7684\u201c\u8def\u75f4\u201d\u3002<\/p>\n<p>\u56e0\u4e3a\u5b83\u662f\u628a\u6240\u6709\u8bcd\u4e00\u53e3\u6c14\u5403\u8fdb\u53bb\u7684&#xff0c;\u6240\u4ee5\u5728\u5b83\u773c\u91cc&#xff0c;\u201c\u6211\u6253\u4f60\u201d\u548c\u201c\u4f60\u6253\u6211\u201d\u662f\u4e00\u6a21\u4e00\u6837\u7684\u2014\u2014\u4e0d\u5c31\u662f\u201c\u6211\u3001\u4f60\u3001\u6253\u201d\u8fd9\u4e09\u4e2a\u8bcd\u7684\u7ec4\u5408\u561b&#xff01;<\/p>\n<p>\u4e3a\u4e86\u6cbb\u597d\u8fd9\u4e2a Bug&#xff0c;\u6211\u4eec\u9700\u8981\u7ed9\u6bcf\u4e2a\u8bcd\u88c5\u4e0a\u8eab\u4efd\u8bc1&#xff08;Embedding&#xff09;\u548cGPS&#xff08;\u4f4d\u7f6e\u7f16\u7801&#xff09;\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u4e09\u7ae0&#xff1a;\u5355\u8bcd\u7684 GPS\u2014\u2014Embedding \u4e0e\u4f4d\u7f6e\u7f16\u7801<\/h2>\n<h4>1. \u673a\u5668\u7684\u5b57\u5178&#xff1a;Embedding&#xff08;\u8bcd\u5d4c\u5165&#xff09;<\/h4>\n<p>\u9996\u5148&#xff0c;\u4e0d\u7ba1\u662f Transformer \u8fd8\u662f\u4ee5\u524d\u7684 RNN&#xff0c;\u673a\u5668\u5176\u5b9e\u6839\u672c\u4e0d\u8ba4\u8bc6\u201cApple\u201d\u8fd9\u4e2a\u8bcd\u3002\u5728\u5b83\u770b\u6765&#xff0c;\u8fd9\u53ea\u662f\u4e00\u4e32 ASCII \u7801\u3002<\/p>\n<p>\u6211\u4eec\u9700\u8981\u628a\u6587\u5b57\u53d8\u6210\u673a\u5668\u80fd\u8ba1\u7b97\u7684\u6570\u5b57\u3002\u4f46\u4e0d\u80fd\u7b80\u5355\u5730\u628a Apple \u53d8\u6210 1&#xff0c;Banana \u53d8\u6210 2\u3002\u56e0\u4e3a 1 \u548c 2 \u5728\u6570\u5b66\u4e0a\u6328\u5f97\u5f88\u8fd1&#xff0c;\u4f46\u82f9\u679c\u548c\u9999\u8549\u5728\u8bed\u4e49\u4e0a\u53ef\u80fd\u5dee\u5f88\u8fdc&#xff08;\u6216\u8005\u5f88\u8fd1&#xff09;\u3002<\/p>\n<p>\u6211\u4eec\u4f7f\u7528\u7684\u662f Embedding&#xff08;\u8bcd\u5d4c\u5165&#xff09;\u3002<\/p>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u5355\u8bcd\u7684\u201cDNA 
\u56fe\u8c31\u201d<\/h5>\n<p>\u60f3\u8c61\u4e00\u4e0b&#xff0c;\u6211\u4eec\u628a\u5b57\u5178\u91cc\u7684\u6bcf\u4e00\u4e2a\u8bcd&#xff0c;\u90fd\u5bf9\u5e94\u5230\u4e00\u4e2a\u957f\u957f\u7684\u6570\u5b57\u5217\u8868&#xff08;\u5411\u91cf&#xff09;\u3002\u8fd9\u4e2a\u5217\u8868\u901a\u5e38\u6709 512 \u7ef4&#xff08;\u6216\u8005 768\u30011024 \u7ef4&#xff09;\u3002<\/p>\n<p>\u6bcf\u4e00\u7ef4\u6570\u5b57\u90fd\u4ee3\u8868\u8fd9\u4e2a\u8bcd\u7684\u4e00\u79cd\u201c\u6f5c\u5728\u7279\u5f81\u201d\u3002\u867d\u7136\u673a\u5668\u4e0d\u4f1a\u660e\u8bf4&#xff0c;\u4f46\u6211\u4eec\u53ef\u4ee5\u8fd9\u6837\u8111\u8865&#xff1a;<\/p>\n<ul>\n<li>\u7b2c 1 \u7ef4\u4ee3\u8868\u201c\u662f\u4e0d\u662f\u540d\u8bcd\u201d<\/li>\n<li>\u7b2c 2 \u7ef4\u4ee3\u8868\u201c\u662f\u4e0d\u662f\u6d3b\u7269\u201d<\/li>\n<li>\u7b2c 3 \u7ef4\u4ee3\u8868\u201c\u738b\u5ba4\u7a0b\u5ea6\u201d<\/li>\n<li>\u2026<\/li>\n<\/ul>\n<p>\u6bd4\u5982&#xff1a;<\/p>\n<ul>\n<li>King (\u56fd\u738b) -&gt; [0.9, 0.9, 0.99, &#8230;] (\u540d\u8bcd&#xff0c;\u6d3b\u4eba&#xff0c;\u6781\u5176\u738b\u5ba4)<\/li>\n<li>Queen (\u5973\u738b) -&gt; [0.9, 0.9, 0.99, &#8230;] (\u8ddf\u56fd\u738b\u5f88\u50cf&#xff0c;\u53ea\u6709\u4ee3\u8868\u6027\u522b\u7684\u67d0\u4e00\u7ef4\u4e0d\u540c)<\/li>\n<li>Apple (\u82f9\u679c) -&gt; [0.9, 0.1, 0.00, &#8230;] (\u540d\u8bcd&#xff0c;\u690d\u7269&#xff0c;\u8ddf\u738b\u5ba4\u6ca1\u534a\u6bdb\u94b1\u5173\u7cfb)<\/li>\n<\/ul>\n<p>\u8fd9\u5c31\u662f Embedding&#xff1a;\u628a\u51b0\u51b7\u7684\u5355\u8bcd&#xff0c;\u53d8\u6210\u4e86\u6709\u4e30\u5bcc\u8bed\u4e49\u5185\u6db5\u7684\u6570\u5b66\u5411\u91cf\u3002<\/p>\n<hr \/>\n<h4>2. 
\u6cbb\u6108\u8def\u75f4&#xff1a;Positional Encoding&#xff08;\u4f4d\u7f6e\u7f16\u7801&#xff09;<\/h4>\n<p>Embedding \u641e\u5b9a\u540e&#xff0c;\u771f\u6b63\u7684\u9ebb\u70e6\u6765\u4e86\u3002<\/p>\n<p>RNN \u662f\u6309\u987a\u5e8f\u8bfb\u7684&#xff0c;\u8bfb\u5b8c\u7b2c\u4e00\u4e2a\u624d\u80fd\u8bfb\u7b2c\u4e8c\u4e2a&#xff0c;\u6240\u4ee5\u5b83\u5929\u7136\u77e5\u9053\u4f4d\u7f6e\u4fe1\u606f\u3002\u4f46 Transformer \u662f\u5e76\u884c\u7684&#xff08;\u8fd8\u8bb0\u5f97\u7b2c\u4e00\u7ae0\u9e21\u5c3e\u9152\u4f1a\u7684\u6bd4\u55bb\u5417&#xff1f;&#xff09;\u3002<\/p>\n<p>\u5982\u679c\u6211\u4eec\u76f4\u63a5\u628a Embedding(&#034;\u6211&#034;)&#xff0c;Embedding(&#034;\u7231&#034;)&#xff0c;Embedding(&#034;\u4f60&#034;) \u6254\u8fdb\u53bb&#xff0c;\u6a21\u578b\u53ea\u77e5\u9053\u8fd9\u4e09\u4e2a\u8bcd\u51fa\u73b0\u4e86&#xff0c;\u5b8c\u5168\u4e0d\u77e5\u9053\u8c01\u5728\u8c01\u524d\u9762\u3002<\/p>\n<p>\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898&#xff0c;\u806a\u660e\u7684\u5de5\u7a0b\u5e08\u60f3\u4e86\u4e00\u62db&#xff1a;\u7ed9 Embedding \u5411\u91cf\u201c\u52a0\u201d\u70b9\u6599\u3002<\/p>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u7ed9\u5ea7\u4f4d\u7f16\u53f7<\/h5>\n<p>\u60f3\u8c61\u4e00\u7fa4\u4eba&#xff08;\u5355\u8bcd&#xff09;\u4e71\u54c4\u54c4\u5730\u6324\u8fdb\u4e00\u4e2a\u623f\u95f4&#xff08;Transformer&#xff09;\u3002 \u4e3a\u4e86\u4e0d\u4e71\u5957&#xff0c;\u6211\u4eec\u5728\u6bcf\u4e2a\u4eba\u8fdb\u95e8\u65f6&#xff0c;\u5f3a\u884c\u7ed9\u4ed6\u7684\u8863\u670d\u4e0a\u8d34\u4e00\u4e2a\u53f7\u7801\u724c&#xff0c;\u6216\u8005\u53d1\u4e00\u4e2a\u5e26\u6709\u4f4d\u7f6e\u4fe1\u606f\u7684 GPS \u4fe1\u53f7\u3002<\/p>\n<ul>\n<li>\u5bf9\u4e8e\u7b2c\u4e00\u4e2a\u8bcd\u201c\u6211\u201d&#xff0c;\u6211\u4eec\u7ed9\u5b83\u52a0\u4e0a <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          P<\/p>\n<p>          1<\/p>\n<p>        P_1<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: 
-0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">P<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.1389em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u7684\u4f4d\u7f6e\u4fe1\u53f7\u3002<\/li>\n<li>\u5bf9\u4e8e\u7b2c\u4e8c\u4e2a\u8bcd\u201c\u7231\u201d&#xff0c;\u6211\u4eec\u7ed9\u5b83\u52a0\u4e0a <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          P<\/p>\n<p>          2<\/p>\n<p>        P_2<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">P<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.1389em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> 
\u7684\u4f4d\u7f6e\u4fe1\u53f7\u3002<\/li>\n<\/ul>\n<p>\u8fd9\u6837&#xff0c;\u5373\u4f7f\u5927\u5bb6\u6df7\u5728\u4e00\u8d77&#xff0c;\u6a21\u578b\u53ea\u8981\u770b\u4e00\u773c\u4fe1\u53f7&#xff0c;\u5c31\u77e5\u9053&#xff1a;\u201c\u54e6&#xff0c;\u867d\u7136\u4f60\u4eec\u662f\u4e00\u8d77\u8fdb\u6765\u7684&#xff0c;\u4f46\u2018\u6211\u2019\u662f\u6392\u5728\u7b2c\u4e00\u53f7\u5ea7\u4f4d\u7684\u3002\u201d<\/p>\n<h5>\u26a0\ufe0f \u8fd9\u91cc\u7684\u6838\u5fc3\u9a9a\u64cd\u4f5c&#xff1a;\u662f\u76f8\u52a0&#xff08;Add&#xff09;&#xff0c;\u4e0d\u662f\u62fc\u63a5&#xff08;Concat&#xff09;<\/h5>\n<p>\u521d\u5b66\u8005\u5f88\u5bb9\u6613\u89c9\u5f97&#xff0c;\u4f4d\u7f6e\u4fe1\u606f\u5e94\u8be5\u50cf\u6302\u62d6\u8f66\u4e00\u6837\u8ddf\u5728\u8bcd\u5411\u91cf\u540e\u9762&#xff08;\u62fc\u63a5&#xff09;\u3002\u4f46 Transformer \u5c45\u7136\u662f\u76f4\u63a5\u628a\u4f4d\u7f6e\u5411\u91cf\u201c\u52a0\u201d\u5230\u4e86\u8bcd\u5411\u91cf\u4e0a\u9762&#xff01;<\/p>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         I<\/p>\n<p>         n<\/p>\n<p>         p<\/p>\n<p>         u<\/p>\n<p>         t<\/p>\n<p>         &#061;<\/p>\n<p>         E<\/p>\n<p>         m<\/p>\n<p>         b<\/p>\n<p>         e<\/p>\n<p>         d<\/p>\n<p>         d<\/p>\n<p>         i<\/p>\n<p>         n<\/p>\n<p>         g<\/p>\n<p>         &#043;<\/p>\n<p>         P<\/p>\n<p>         o<\/p>\n<p>         s<\/p>\n<p>         i<\/p>\n<p>         t<\/p>\n<p>         i<\/p>\n<p>         o<\/p>\n<p>         n<\/p>\n<p>         a<\/p>\n<p>         l<\/p>\n<p>         E<\/p>\n<p>         n<\/p>\n<p>         c<\/p>\n<p>         o<\/p>\n<p>         d<\/p>\n<p>         i<\/p>\n<p>         n<\/p>\n<p>         g<\/p>\n<p>        Input &#061; Embedding &#043; PositionalEncoding<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" 
style=\"margin-right: 0.0785em\">I<\/span><span class=\"mord mathnormal\">n<\/span><span class=\"mord mathnormal\">p<\/span><span class=\"mord mathnormal\">u<\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8889em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0576em\">E<\/span><span class=\"mord mathnormal\">mb<\/span><span class=\"mord mathnormal\">e<\/span><span class=\"mord mathnormal\">dd<\/span><span class=\"mord mathnormal\">in<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">g<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8889em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">P<\/span><span class=\"mord mathnormal\">os<\/span><span class=\"mord mathnormal\">i<\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mord mathnormal\">i<\/span><span class=\"mord mathnormal\">o<\/span><span class=\"mord mathnormal\">na<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0576em\">lE<\/span><span class=\"mord mathnormal\">n<\/span><span class=\"mord mathnormal\">co<\/span><span class=\"mord mathnormal\">d<\/span><span class=\"mord mathnormal\">in<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">g<\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<p>\u8fd9\u5c31\u50cf\u662f&#xff1a;<\/p>\n<ul>\n<li>\u8bcd\u4e49\u662f\u7ea2\u8272\u989c\u6599\u3002<\/li>\n<li>\u4f4d\u7f6e\u662f\u84dd\u8272\u989c\u6599\u3002<\/li>\n<li>Transformer 
\u5b9e\u9645\u4e0a\u628a\u5b83\u4eec\u6df7\u6210\u4e86\u7d2b\u8272\u989c\u6599\u8f93\u8fdb\u53bb\u3002<\/li>\n<\/ul>\n<p>\u8fd9\u96be\u9053\u4e0d\u4f1a\u7834\u574f\u539f\u6765\u7684\u8bcd\u4e49\u5417&#xff1f; \u7b54\u6848\u662f&#xff1a;\u4f1a\u6709\u4e00\u70b9\u70b9\u5e72\u6270&#xff0c;\u4f46\u6ca1\u5173\u7cfb\u3002\u56e0\u4e3a\u8bcd\u5411\u91cf\u7684\u7ef4\u5ea6\u5f88\u9ad8&#xff08;512\u7ef4&#xff09;&#xff0c;\u800c\u4f4d\u7f6e\u7f16\u7801\u7684\u6570\u503c\u5f88\u72ec\u7279\u3002\u5728\u9ad8\u7ef4\u7a7a\u95f4\u91cc&#xff0c;\u6a21\u578b\u80fd\u591f\u5b66\u4f1a\u628a\u201c\u8bcd\u4e49\u201d\u548c\u201c\u4f4d\u7f6e\u201d\u91cd\u65b0\u5206\u79bb\u5f00\u6765\u3002<\/p>\n<h4>3. \u6b63\u5f26\u6ce2\u7684\u9b54\u6cd5&#xff1a;\u4e3a\u4ec0\u4e48\u7528 Sin\/Cos&#xff1f;<\/h4>\n<p>Transformer \u7684\u4f4d\u7f6e\u7f16\u7801\u5e76\u4e0d\u662f\u7b80\u5355\u7684 1, 2, 3, 4 \u8fd9\u79cd\u6574\u6570\u3002\u56e0\u4e3a\u5982\u679c\u53e5\u5b50\u592a\u957f&#xff0c;\u6570\u503c\u4f1a\u53d8\u5f97\u65e0\u9650\u5927&#xff0c;\u628a\u8bcd\u4e49\u5b8c\u5168\u76d6\u8fc7\u53bb\u3002<\/p>\n<p>\u5b83\u4f7f\u7528\u7684\u662f\u6b63\u5f26\u548c\u4f59\u5f26\u51fd\u6570&#xff08;Sin\/Cos&#xff09;\u3002<\/p>\n<p>  #mermaid-svg-TYRwknC3AY4yCjnV{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-TYRwknC3AY4yCjnV .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-TYRwknC3AY4yCjnV .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-TYRwknC3AY4yCjnV .error-icon{fill:#552222;}#mermaid-svg-TYRwknC3AY4yCjnV .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-TYRwknC3AY4yCjnV .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-TYRwknC3AY4yCjnV 
.edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-TYRwknC3AY4yCjnV .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-TYRwknC3AY4yCjnV .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-TYRwknC3AY4yCjnV .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-TYRwknC3AY4yCjnV .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-TYRwknC3AY4yCjnV .marker{fill:#333333;stroke:#333333;}#mermaid-svg-TYRwknC3AY4yCjnV .marker.cross{stroke:#333333;}#mermaid-svg-TYRwknC3AY4yCjnV svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-TYRwknC3AY4yCjnV p{margin:0;}#mermaid-svg-TYRwknC3AY4yCjnV .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-TYRwknC3AY4yCjnV .cluster-label text{fill:#333;}#mermaid-svg-TYRwknC3AY4yCjnV .cluster-label span{color:#333;}#mermaid-svg-TYRwknC3AY4yCjnV .cluster-label span p{background-color:transparent;}#mermaid-svg-TYRwknC3AY4yCjnV .label text,#mermaid-svg-TYRwknC3AY4yCjnV span{fill:#333;color:#333;}#mermaid-svg-TYRwknC3AY4yCjnV .node rect,#mermaid-svg-TYRwknC3AY4yCjnV .node circle,#mermaid-svg-TYRwknC3AY4yCjnV .node ellipse,#mermaid-svg-TYRwknC3AY4yCjnV .node polygon,#mermaid-svg-TYRwknC3AY4yCjnV .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-TYRwknC3AY4yCjnV .rough-node .label text,#mermaid-svg-TYRwknC3AY4yCjnV .node .label text,#mermaid-svg-TYRwknC3AY4yCjnV .image-shape .label,#mermaid-svg-TYRwknC3AY4yCjnV .icon-shape .label{text-anchor:middle;}#mermaid-svg-TYRwknC3AY4yCjnV .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-TYRwknC3AY4yCjnV .rough-node .label,#mermaid-svg-TYRwknC3AY4yCjnV .node .label,#mermaid-svg-TYRwknC3AY4yCjnV .image-shape .label,#mermaid-svg-TYRwknC3AY4yCjnV .icon-shape .label{text-align:center;}#mermaid-svg-TYRwknC3AY4yCjnV .node.clickable{cursor:pointer;}#mermaid-svg-TYRwknC3AY4yCjnV .root .anchor 
path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-TYRwknC3AY4yCjnV .arrowheadPath{fill:#333333;}#mermaid-svg-TYRwknC3AY4yCjnV .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-TYRwknC3AY4yCjnV .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-TYRwknC3AY4yCjnV .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-TYRwknC3AY4yCjnV .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-TYRwknC3AY4yCjnV .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-TYRwknC3AY4yCjnV .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-TYRwknC3AY4yCjnV .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-TYRwknC3AY4yCjnV .cluster text{fill:#333;}#mermaid-svg-TYRwknC3AY4yCjnV .cluster span{color:#333;}#mermaid-svg-TYRwknC3AY4yCjnV div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-TYRwknC3AY4yCjnV .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-TYRwknC3AY4yCjnV rect.text{fill:none;stroke-width:0;}#mermaid-svg-TYRwknC3AY4yCjnV .icon-shape,#mermaid-svg-TYRwknC3AY4yCjnV .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-TYRwknC3AY4yCjnV .icon-shape p,#mermaid-svg-TYRwknC3AY4yCjnV .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-TYRwknC3AY4yCjnV .icon-shape rect,#mermaid-svg-TYRwknC3AY4yCjnV .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-TYRwknC3AY4yCjnV .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-TYRwknC3AY4yCjnV .node .label-icon 
path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-TYRwknC3AY4yCjnV :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u4f4d\u7f6e\u7f16\u7801\u7684\u6ce2\u7eb9<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>Sin\/Cos<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>Sin\/Cos<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>Sin\/Cos<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u4f4d\u7f6e 1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6ce2\u7eb9\u5411\u91cf 1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u4f4d\u7f6e 2<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6ce2\u7eb9\u5411\u91cf 2<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u4f4d\u7f6e 100<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6ce2\u7eb9\u5411\u91cf 100<\/p>\n<p><\/span><\/p>\n<p>\u4f60\u53ef\u4ee5\u628a\u5b83\u7406\u89e3\u4e3a<strong>\u201c\u591a\u91cd\u65f6\u949f\u201d<\/strong>&#xff1a;<\/p>\n<ul>\n<li>\u5411\u91cf\u7684\u7b2c 1 \u7ef4\u50cf\u79d2\u9488&#xff0c;\u8d70\u5f97\u5feb&#xff08;\u9891\u7387\u9ad8&#xff09;\u3002<\/li>\n<li>\u5411\u91cf\u7684\u7b2c 2 \u7ef4\u50cf\u5206\u9488&#xff0c;\u8d70\u5f97\u6162\u3002<\/li>\n<li>\u5411\u91cf\u7684\u7b2c 3 \u7ef4\u50cf\u65f6\u9488&#xff0c;\u8d70\u5f97\u66f4\u6162\u3002<\/li>\n<\/ul>\n<p>\u6bcf\u4e2a\u4f4d\u7f6e&#xff08;Time Step&#xff09;&#xff0c;\u90fd\u5bf9\u5e94\u7740\u8fd9\u4e9b\u6307\u9488\u7684\u4e00\u4e2a\u552f\u4e00\u7ec4\u5408\u3002 \u8fd9\u6837&#xff0c;\u65e0\u8bba\u53e5\u5b50\u591a\u957f&#xff0c;\u4f4d\u7f6e\u7f16\u7801\u7684\u6570\u503c\u6c38\u8fdc\u5728 -1 \u5230 1 \u4e4b\u95f4\u9707\u8361&#xff0c;\u65e2\u7a33\u5b9a\u53c8\u72ec\u7279\u3002<\/p>\n<h4>4. 
\u6d41\u7a0b\u56fe\u89e3<\/h4>\n<p>\u8ba9\u6211\u4eec\u7528 Mermaid \u628a\u8fd9\u4e24\u4e2a\u6b65\u9aa4\u5408\u4e8c\u4e3a\u4e00&#xff0c;\u770b\u770b\u6570\u636e\u8fdb\u5165 Transformer \u5185\u90e8\u7684\u7b2c\u4e00\u6b65\u5230\u5e95\u957f\u5565\u6837\u3002<\/p>\n<p>  #mermaid-svg-to7xurLLWY4CfU44{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-to7xurLLWY4CfU44 .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-to7xurLLWY4CfU44 .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-to7xurLLWY4CfU44 .error-icon{fill:#552222;}#mermaid-svg-to7xurLLWY4CfU44 .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-to7xurLLWY4CfU44 .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-to7xurLLWY4CfU44 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-to7xurLLWY4CfU44 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-to7xurLLWY4CfU44 .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-to7xurLLWY4CfU44 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-to7xurLLWY4CfU44 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-to7xurLLWY4CfU44 .marker{fill:#333333;stroke:#333333;}#mermaid-svg-to7xurLLWY4CfU44 .marker.cross{stroke:#333333;}#mermaid-svg-to7xurLLWY4CfU44 svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-to7xurLLWY4CfU44 p{margin:0;}#mermaid-svg-to7xurLLWY4CfU44 .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-to7xurLLWY4CfU44 .cluster-label text{fill:#333;}#mermaid-svg-to7xurLLWY4CfU44 .cluster-label span{color:#333;}#mermaid-svg-to7xurLLWY4CfU44 .cluster-label span 
p{background-color:transparent;}#mermaid-svg-to7xurLLWY4CfU44 .label text,#mermaid-svg-to7xurLLWY4CfU44 span{fill:#333;color:#333;}#mermaid-svg-to7xurLLWY4CfU44 .node rect,#mermaid-svg-to7xurLLWY4CfU44 .node circle,#mermaid-svg-to7xurLLWY4CfU44 .node ellipse,#mermaid-svg-to7xurLLWY4CfU44 .node polygon,#mermaid-svg-to7xurLLWY4CfU44 .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-to7xurLLWY4CfU44 .rough-node .label text,#mermaid-svg-to7xurLLWY4CfU44 .node .label text,#mermaid-svg-to7xurLLWY4CfU44 .image-shape .label,#mermaid-svg-to7xurLLWY4CfU44 .icon-shape .label{text-anchor:middle;}#mermaid-svg-to7xurLLWY4CfU44 .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-to7xurLLWY4CfU44 .rough-node .label,#mermaid-svg-to7xurLLWY4CfU44 .node .label,#mermaid-svg-to7xurLLWY4CfU44 .image-shape .label,#mermaid-svg-to7xurLLWY4CfU44 .icon-shape .label{text-align:center;}#mermaid-svg-to7xurLLWY4CfU44 .node.clickable{cursor:pointer;}#mermaid-svg-to7xurLLWY4CfU44 .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-to7xurLLWY4CfU44 .arrowheadPath{fill:#333333;}#mermaid-svg-to7xurLLWY4CfU44 .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-to7xurLLWY4CfU44 .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-to7xurLLWY4CfU44 .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-to7xurLLWY4CfU44 .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-to7xurLLWY4CfU44 .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-to7xurLLWY4CfU44 .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-to7xurLLWY4CfU44 .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-to7xurLLWY4CfU44 .cluster text{fill:#333;}#mermaid-svg-to7xurLLWY4CfU44 .cluster span{color:#333;}#mermaid-svg-to7xurLLWY4CfU44 
div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-to7xurLLWY4CfU44 .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-to7xurLLWY4CfU44 rect.text{fill:none;stroke-width:0;}#mermaid-svg-to7xurLLWY4CfU44 .icon-shape,#mermaid-svg-to7xurLLWY4CfU44 .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-to7xurLLWY4CfU44 .icon-shape p,#mermaid-svg-to7xurLLWY4CfU44 .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-to7xurLLWY4CfU44 .icon-shape rect,#mermaid-svg-to7xurLLWY4CfU44 .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-to7xurLLWY4CfU44 .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-to7xurLLWY4CfU44 .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-to7xurLLWY4CfU44 :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165\u5904\u7406\u6d41\u7a0b<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u539f\u59cb\u6587\u672c: I love AI<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u5206\u8bcd (Tokenization)<\/p>\n<p><\/span><\/p>\n<p>           <span 
class=\"nodeLabel\"><\/p>\n<p>Token IDs: [24, 56, 99]<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Embedding \u67e5\u8868<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8bcd\u5411\u91cf (Word Embeddings)  [Batch, Seq_Len, 512]<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\">&#043;<\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u4f4d\u7f6e\u7f16\u7801\u751f\u6210 (Positional Encoding)  (Sin\/Cos Waves)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6700\u7ec8\u8f93\u5165\u5411\u91cf  (\u5e26\u7740\u4f4d\u7f6e\u4fe1\u606f\u7684\u8bcd\u5411\u91cf)<\/p>\n<p><\/span><\/p>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<ol>\n<li>Embedding \u8ba9\u673a\u5668\u7406\u89e3\u4e86\u8bcd\u4e49&#xff08;\u56fd\u738b\u548c\u5973\u738b\u5f88\u50cf&#xff09;\u3002<\/li>\n<li>Positional Encoding \u8ba9\u673a\u5668\u7406\u89e3\u4e86\u987a\u5e8f&#xff08;\u7531\u4e8e\u5e76\u884c\u8ba1\u7b97\u4e22\u5931\u4e86\u987a\u5e8f&#xff0c;\u9700\u8981\u4eba\u5de5\u8865\u56de\u6765&#xff09;\u3002<\/li>\n<li>\u5b83\u4eec\u662f\u901a\u8fc7\u6570\u5b66\u76f8\u52a0\u7684\u65b9\u5f0f\u878d\u5408\u7684\u3002<\/li>\n<\/ol>\n<p>\u73b0\u5728&#xff0c;\u6570\u636e\u5df2\u7ecf\u51c6\u5907\u597d\u4e86&#xff01;\u5b83\u65e2\u5305\u542b\u4e86\u201c\u662f\u4ec0\u4e48\u610f\u601d\u201d&#xff0c;\u4e5f\u5305\u542b\u4e86\u201c\u5728\u4ec0\u4e48\u4f4d\u7f6e\u201d\u3002<\/p>\n<hr \/>\n<p>\u5982\u679c\u8bf4 Embedding \u662f\u7ed9 Transformer \u4f9b\u8840\u7684\u8840\u7ba1&#xff0c;\u90a3 Self-Attention&#xff08;\u81ea\u6ce8\u610f\u529b\u673a\u5236&#xff09; \u5c31\u662f\u5b83\u7684\u5fc3\u810f\u3002\u8fd9\u4e5f\u662f\u8fd9\u7bc7\u8bba\u6587\u6807\u9898\u300aAttention Is All You Need\u300b\u91cc\u90a3\u4e2a\u4ef7\u503c\u5343\u91d1\u7684\u8bcd\u3002<\/p>\n<p>\u5f88\u591a\u6559\u7a0b\u5728\u8fd9\u91cc\u4f1a\u7529\u51fa\u4e00\u5806 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        Q<\/p>\n<p>      
  \u00d7<\/p>\n<p>         K<\/p>\n<p>         T<\/p>\n<p>       Q \\\\times K^T<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8413em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.1389em\">T<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u7684\u516c\u5f0f\u628a\u4f60\u529d\u9000\u3002\u522b\u6015&#xff0c;\u6211\u4eec\u8fd8\u662f\u7528\u6545\u4e8b\u6765\u8bb2\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u56db\u7ae0&#xff1a;\u7075\u9b42\u6838\u5fc3\u2014\u2014\u81ea\u6ce8\u610f\u529b\u673a\u5236 (Self-Attention)<\/h2>\n<h4>1. 
\u5230\u5e95\u4ec0\u4e48\u662f\u201c\u81ea\u6ce8\u610f\u529b\u201d&#xff1f;<\/h4>\n<p>\u6211\u4eec\u5728\u7b2c\u4e00\u7ae0\u8bf4\u8fc7&#xff0c;RNN \u7684\u6bdb\u75c5\u662f\u201c\u8bfb\u5230\u540e\u9762\u5fd8\u524d\u9762\u201d\u3002\u800c Self-Attention \u7684\u6838\u5fc3\u601d\u60f3\u5c31\u662f&#xff1a;\u5728\u8bfb\u6bcf\u4e00\u4e2a\u8bcd\u7684\u65f6\u5019&#xff0c;\u90fd\u628a\u6574\u4e2a\u53e5\u5b50\u7684\u5176\u4ed6\u8bcd\u518d\u770b\u4e00\u904d&#xff0c;\u770b\u770b\u8c01\u8ddf\u81ea\u5df1\u6700\u4eb2\u3002<\/p>\n<p>\u8fd8\u662f\u90a3\u53e5\u7ecf\u5178\u7684\u4f8b\u5b50&#xff1a;<\/p>\n<p>\u201cThe animal didn\u2019t cross the street because it was too tired.\u201d<\/p>\n<p>\u5f53\u673a\u5668\u8bfb\u5230 \u201cit\u201d \u8fd9\u4e2a\u8bcd\u65f6&#xff0c;\u5b83\u7684\u6ce8\u610f\u529b\u5e94\u8be5\u653e\u5728\u54ea\u91cc&#xff1f;<\/p>\n<ul>\n<li>\u5b83\u5e94\u8be5\u53bb\u5173\u6ce8 \u201cstreet\u201d \u5417&#xff1f;\u4e0d\u5bf9&#xff0c;\u9a6c\u8def\u4e0d\u4f1a\u89c9\u5f97\u7d2f\u3002<\/li>\n<li>\u5b83\u5e94\u8be5\u53bb\u5173\u6ce8 \u201canimal\u201d \u5417&#xff1f;\u5bf9&#xff01;\u52a8\u7269\u624d\u4f1a\u89c9\u5f97\u7d2f\u3002<\/li>\n<\/ul>\n<p>Self-Attention \u5c31\u50cf\u4e00\u76cf\u805a\u5149\u706f\u3002\u5f53\u5904\u7406 \u201cit\u201d \u65f6&#xff0c;\u5b83\u4f1a\u628a\u706f\u5149\u6700\u5f3a\u70c8\u5730\u6253\u5728 \u201canimal\u201d \u8eab\u4e0a&#xff0c;\u628a \u201cstreet\u201d \u653e\u5728\u9634\u5f71\u91cc\u3002\u8fd9\u6837&#xff0c;\u201cit\u201d \u5c31\u5438\u6536\u4e86 \u201canimal\u201d \u7684\u542b\u4e49\u3002<\/p>\n<hr \/>\n<h4>2. 
\u6838\u5fc3\u94c1\u4e09\u89d2&#xff1a;Q\u3001K\u3001V \u7684\u8eab\u4e16\u4e4b\u8c1c<\/h4>\n<p>\u4e3a\u4e86\u5b9e\u73b0\u8fd9\u79cd\u201c\u627e\u5173\u7cfb\u201d\u7684\u80fd\u529b&#xff0c;Transformer \u628a\u8f93\u5165\u7684\u6bcf\u4e00\u4e2a\u8bcd\u5411\u91cf&#xff08;Vector&#xff09;&#xff0c;\u90fd\u5206\u88c2\u6210\u4e86\u4e09\u4e2a\u5206\u8eab\u3002<\/p>\n<p>\u8fd9\u4e09\u4e2a\u5206\u8eab\u662f\u901a\u8fc7\u5206\u522b\u4e58\u4ee5\u4e09\u4e2a\u4e0d\u540c\u7684\u6743\u91cd\u77e9\u9635&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         W<\/p>\n<p>         Q<\/p>\n<p>        ,<\/p>\n<p>         W<\/p>\n<p>         K<\/p>\n<p>        ,<\/p>\n<p>         W<\/p>\n<p>         V<\/p>\n<p>       W^Q, W^K, W^V<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.0358em;vertical-align: -0.1944em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">Q<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 
0.0715em\">K<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff09;\u53d8\u51fa\u6765\u7684\u3002\u5b83\u4eec\u5206\u522b\u662f&#xff1a;<\/p>\n<li>\n<p>Query (\u67e5\u8be2\u5411\u91cf)&#xff1a;\u8fd9\u662f<strong>\u201c\u6211\u624b\u4e2d\u7684\u724c\u201d<\/strong>\u3002<\/p>\n<ul>\n<li>\u4ee3\u8868\u5f53\u524d\u8fd9\u4e2a\u8bcd\u53bb\u201c\u5bfb\u627e\u201d\u5176\u4ed6\u8bcd\u65f6\u7684\u8bc9\u6c42\u3002<\/li>\n<li>\u6bd4\u5982 \u201cit\u201d \u7684 Query \u5728\u558a&#xff1a;\u201c\u8c01\u662f\u80fd\u8ba9\u6211\u2018\u89c9\u5f97\u7d2f\u2019\u7684\u540d\u8bcd&#xff1f;\u201d<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>Key (\u952e\u5411\u91cf)&#xff1a;\u8fd9\u662f<strong>\u201c\u4f60\u5934\u4e0a\u7684\u6807\u7b7e\u201d<\/strong>\u3002<\/p>\n<ul>\n<li>\u4ee3\u8868\u8fd9\u4e2a\u8bcd\u88ab\u7528\u6765\u201c\u5339\u914d\u201d\u65f6\u7684\u7279\u5f81\u3002<\/li>\n<li>\u6bd4\u5982 \u201canimal\u201d \u7684 Key \u5199\u7740&#xff1a;\u201c\u6211\u662f\u540d\u8bcd&#xff0c;\u6211\u662f\u6d3b\u7269\u201d\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>Value (\u503c\u5411\u91cf)&#xff1a;\u8fd9\u662f<strong>\u201c\u4f60\u7684\u5185\u6db5\u201d<\/strong>\u3002<\/p>\n<ul>\n<li>\u4ee3\u8868\u8fd9\u4e2a\u8bcd\u771f\u6b63\u7684\u8bed\u4e49\u5185\u5bb9\u3002<\/li>\n<li>\u5982\u679c\u5339\u914d\u6210\u529f&#xff0c;\u5c31\u8981\u628a\u8fd9\u4e2a Value \u62ff\u8d70&#xff0c;\u878d\u5408\u5230 Query 
\u8eab\u4e0a\u3002<\/li>\n<\/ul>\n<\/li>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u56fe\u4e66\u9986\u68c0\u7d22\u7cfb\u7edf<\/h5>\n<p>\u4e3a\u4e86\u5f7b\u5e95\u641e\u61c2 Q\u3001K\u3001V&#xff0c;\u6211\u4eec\u53ef\u4ee5\u628a\u8fd9\u770b\u4f5c\u4e00\u6b21\u56fe\u4e66\u9986\u67e5\u4e66\u7684\u8fc7\u7a0b&#xff1a;<\/p>\n<ul>\n<li>Query (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          Q<\/p>\n<p>         Q<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span>)&#xff1a;\u662f\u4f60\u624b\u91cc\u7684\u501f\u4e66\u6761&#xff08;\u4e0a\u9762\u5199\u7740&#xff1a;\u6211\u8981\u627e\u5173\u4e8e\u201c\u5f88\u591a\u6bdb\u7684\u6d3b\u7269\u201d\u7684\u4e66&#xff09;\u3002<\/li>\n<li>Key (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          K<\/p>\n<p>         K<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span>)&#xff1a;\u662f\u4e66\u67b6\u4e0a\u6bcf\u4e00\u672c\u4e66\u810a\u4e0a\u7684\u5206\u7c7b\u7f16\u53f7&#xff08;\u6bd4\u5982&#xff1a;\u751f\u7269\u7c7b-\u54fa\u4e73\u52a8\u7269&#xff09;\u3002<\/li>\n<li>Value (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          V<\/p>\n<p>         V<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span>)&#xff1a;\u662f\u4e66\u91cc\u9762\u771f\u6b63\u7684\u5185\u5bb9\u3002<\/li>\n<\/ul>\n<p>\u8fc7\u7a0b\u662f\u8fd9\u6837\u7684&#xff1a;<\/p>\n<li>\u4f60\u62ff\u7740\u501f\u4e66\u6761 (<span 
class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         Q<\/p>\n<p>        Q<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span>)&#xff0c;\u53bb\u548c\u6bcf\u4e00\u672c\u4e66\u7684\u5206\u7c7b\u53f7 (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         K<\/p>\n<p>        K<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span>) \u8fdb\u884c\u6bd4\u5bf9\u3002<\/li>\n<li>\u5982\u679c\u4e0d\u5339\u914d&#xff0c;\u76f8\u4f3c\u5ea6\u5c31\u5f88\u4f4e&#xff08;\u6bd4\u5982\u90a3\u662f\u672c\u300a\u5fae\u79ef\u5206\u300b&#xff09;\u3002<\/li>\n<li>\u5982\u679c\u5339\u914d\u5ea6\u5f88\u9ad8&#xff08;\u662f\u672c\u300a\u52a8\u7269\u767e\u79d1\u300b&#xff09;&#xff0c;\u4f60\u5c31\u628a\u8fd9\u672c\u4e66\u7684\u5185\u5bb9 (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         V<\/p>\n<p>        V<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span>) \u53d6\u51fa\u6765&#xff0c;\u8bfb\u8fdb\u8111\u5b50\u91cc\u3002<\/li>\n<hr \/>\n<h4>3. 
\u8ba1\u7b97\u4e09\u6b65\u8d70&#xff1a;\u4ece\u76f8\u4eb2\u5230\u7275\u624b<\/h4>\n<p>\u673a\u5668\u5185\u90e8\u662f\u5982\u4f55\u901a\u8fc7\u6570\u5b66\u8fd0\u7b97\u6765\u6a21\u62df\u8fd9\u4e2a\u8fc7\u7a0b\u7684\u5462&#xff1f;\u5176\u5b9e\u5c31\u662f\u5411\u91cf\u70b9\u79ef&#xff08;Dot Product&#xff09;\u3002<\/p>\n<h5>\u7b2c\u4e00\u6b65&#xff1a;\u8ba1\u7b97\u5339\u914d\u5ea6 (Score)<\/h5>\n<p>\u62ff\u5f53\u524d\u8bcd\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        Q<\/p>\n<p>       Q<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span>&#xff0c;\u53bb\u4e58\u4ee5\u6240\u6709\u8bcd\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        K<\/p>\n<p>       K<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span>\u3002 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         S<\/p>\n<p>         c<\/p>\n<p>         o<\/p>\n<p>         r<\/p>\n<p>         e<\/p>\n<p>         &#061;<\/p>\n<p>         Q<\/p>\n<p>         \u22c5<\/p>\n<p>          K<\/p>\n<p>          T<\/p>\n<p>        Score &#061; Q \\\\cdot K^T<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0576em\">S<\/span><span class=\"mord mathnormal\">core<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span 
class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8913em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8913em\"><span class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.1389em\">T<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<ul>\n<li>\u70b9\u79ef\u8d8a\u5927&#xff0c;\u4ee3\u8868\u4e24\u4e2a\u5411\u91cf\u8d8a\u76f8\u4f3c&#xff0c;\u4e5f\u5c31\u662f\u5173\u7cfb\u8d8a\u7d27\u5bc6\u3002<\/li>\n<li>\u201cit\u201d \u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         Q<\/p>\n<p>        Q<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span> \u548c \u201canimal\u201d \u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         K<\/p>\n<p>        K<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span> \u70b9\u79ef&#xff0c;\u5f97\u5206 0.9&#xff08;\u5f88\u9ad8&#xff09;\u3002<\/li>\n<li>\u201cit\u201d \u7684 <span class=\"katex--inline\"><span 
class=\"katex\"><span class=\"katex-mathml\">\n<p>         Q<\/p>\n<p>        Q<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span> \u548c \u201cstreet\u201d \u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         K<\/p>\n<p>        K<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span> \u70b9\u79ef&#xff0c;\u5f97\u5206 0.1&#xff08;\u5f88\u4f4e&#xff09;\u3002<\/li>\n<\/ul>\n<h5>\u7b2c\u4e8c\u6b65&#xff1a;\u5f52\u4e00\u5316\u4e0e\u6253\u5206 (Softmax)<\/h5>\n<p>\u4e3a\u4e86\u8ba9\u5206\u6570\u53d8\u6210\u6982\u7387&#xff08;\u52a0\u8d77\u6765\u7b49\u4e8e 1&#xff09;&#xff0c;\u6211\u4eec\u9700\u8981\u8fc7\u4e00\u4e2a Softmax \u51fd\u6570\u3002 \u5728\u6b64\u4e4b\u524d&#xff0c;\u4e3a\u4e86\u9632\u6b62\u5206\u6570\u592a\u5927\u5bfc\u81f4\u68af\u5ea6\u6d88\u5931&#xff0c;\u901a\u5e38\u4f1a\u9664\u4ee5\u4e00\u4e2a\u7cfb\u6570&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          d<\/p>\n<p>          k<\/p>\n<p>       \\\\sqrt{d_k}<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.04em;vertical-align: -0.1828em\"><\/span><span class=\"mord sqrt\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8572em\"><span class=\"svg-align\" style=\"top: -3em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord\" style=\"padding-left: 0.833em\"><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" 
style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0315em\">k<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"\" style=\"top: -2.8172em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"hide-tail\" style=\"min-width: 0.853em;height: 1.08em\"><\/p>\n<p>           <\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1828em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff0c;\u6bd4\u5982 8&#xff09;\u3002<\/p>\n<ul>\n<li>\u5904\u7406\u540e&#xff1a;\u201canimal\u201d \u7684\u6743\u91cd\u53d8\u6210 88%&#xff0c;\u201cstreet\u201d \u7684\u6743\u91cd\u53d8\u6210 2%&#xff0c;\u5176\u4ed6\u7684\u8bcd\u5206\u5269\u4e0b\u7684 10%\u3002<\/li>\n<\/ul>\n<h5>\u7b2c\u4e09\u6b65&#xff1a;\u52a0\u6743\u6c42\u548c (Weighted Sum)<\/h5>\n<p>\u8fd9\u662f\u6700\u540e\u4e00\u6b65&#xff0c;\u4e5f\u662f\u6700\u795e\u5947\u7684\u4e00\u6b65\u3002 \u7528\u521a\u624d\u7b97\u51fa\u6765\u7684\u6743\u91cd&#xff0c;\u53bb\u4e58\u4ee5\u6240\u6709\u8bcd\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        V<\/p>\n<p>       V<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span>&#xff0c;\u7136\u540e\u52a0\u5728\u4e00\u8d77\u3002<\/p>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span 
class=\"katex-mathml\"><\/p>\n<p>         O<\/p>\n<p>         u<\/p>\n<p>         t<\/p>\n<p>         p<\/p>\n<p>         u<\/p>\n<p>         t<\/p>\n<p>         &#061;<\/p>\n<p>         0.88<\/p>\n<p>         \u00d7<\/p>\n<p>          V<\/p>\n<p>           a<\/p>\n<p>           n<\/p>\n<p>           i<\/p>\n<p>           m<\/p>\n<p>           a<\/p>\n<p>           l<\/p>\n<p>         &#043;<\/p>\n<p>         0.02<\/p>\n<p>         \u00d7<\/p>\n<p>          V<\/p>\n<p>           s<\/p>\n<p>           t<\/p>\n<p>           r<\/p>\n<p>           e<\/p>\n<p>           e<\/p>\n<p>           t<\/p>\n<p>         &#043;<\/p>\n<p>         .<\/p>\n<p>         .<\/p>\n<p>         .<\/p>\n<p>        Output &#061; 0.88 \\\\times V_{animal} &#043; 0.02 \\\\times V_{street} &#043; &#8230;<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">O<\/span><span class=\"mord mathnormal\">u<\/span><span class=\"mord mathnormal\">tp<\/span><span class=\"mord mathnormal\">u<\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">0.88<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span 
class=\"\" style=\"top: -2.55em;margin-left: -0.2222em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">anima<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">0.02<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.2222em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">s<\/span><span class=\"mord mathnormal mtight\">t<\/span><span class=\"mord mathnormal mtight\">ree<\/span><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span 
class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.1056em\"><\/span><span class=\"mord\">&#8230;<\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<p>\u7ed3\u679c&#xff1a; \u6b64\u65f6&#xff0c;\u201cit\u201d \u8fd9\u4e2a\u8bcd\u7ecf\u8fc7\u8fd9\u4e00\u5c42\u5904\u7406\u540e&#xff0c;\u5b83\u7684\u5411\u91cf\u91cc\u867d\u7136\u8fd8\u4fdd\u7559\u7740 \u201cit\u201d \u7684\u5f71\u5b50&#xff0c;\u4f46\u5927\u90e8\u5206\u6210\u5206\u5df2\u7ecf\u53d8\u6210\u4e86 \u201canimal\u201d \u7684\u7279\u5f81\u3002\u5b83\u4e0d\u518d\u662f\u4e00\u4e2a\u5355\u7eaf\u7684\u4ee3\u8bcd&#xff0c;\u5b83\u201c\u5438\u661f\u5927\u6cd5\u201d\u5438\u53d6\u4e86\u4e0a\u4e0b\u6587\u7684\u7cbe\u534e&#xff01;<\/p>\n<hr \/>\n<h4>4. \u6d41\u7a0b\u56fe\u89e3&#xff1a;QKV \u7684\u5185\u90e8\u5de5\u5382<\/h4>\n<p>\u8fd9\u4e2a\u8fc7\u7a0b\u662f\u6240\u6709\u8bcd\u540c\u65f6\u8fdb\u884c\u7684&#xff08;\u77e9\u9635\u8fd0\u7b97&#xff09;\u3002\u8ba9\u6211\u4eec\u770b\u4e0b Mermaid \u56fe\u89e3&#xff1a;<\/p>\n<p>  #mermaid-svg-Ndar7XvFdmue8SMl{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-Ndar7XvFdmue8SMl .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-Ndar7XvFdmue8SMl .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-Ndar7XvFdmue8SMl .error-icon{fill:#552222;}#mermaid-svg-Ndar7XvFdmue8SMl .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-Ndar7XvFdmue8SMl .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-Ndar7XvFdmue8SMl .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-Ndar7XvFdmue8SMl 
.edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-Ndar7XvFdmue8SMl .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-Ndar7XvFdmue8SMl .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-Ndar7XvFdmue8SMl .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-Ndar7XvFdmue8SMl .marker{fill:#333333;stroke:#333333;}#mermaid-svg-Ndar7XvFdmue8SMl .marker.cross{stroke:#333333;}#mermaid-svg-Ndar7XvFdmue8SMl svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-Ndar7XvFdmue8SMl p{margin:0;}#mermaid-svg-Ndar7XvFdmue8SMl .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-Ndar7XvFdmue8SMl .cluster-label text{fill:#333;}#mermaid-svg-Ndar7XvFdmue8SMl .cluster-label span{color:#333;}#mermaid-svg-Ndar7XvFdmue8SMl .cluster-label span p{background-color:transparent;}#mermaid-svg-Ndar7XvFdmue8SMl .label text,#mermaid-svg-Ndar7XvFdmue8SMl span{fill:#333;color:#333;}#mermaid-svg-Ndar7XvFdmue8SMl .node rect,#mermaid-svg-Ndar7XvFdmue8SMl .node circle,#mermaid-svg-Ndar7XvFdmue8SMl .node ellipse,#mermaid-svg-Ndar7XvFdmue8SMl .node polygon,#mermaid-svg-Ndar7XvFdmue8SMl .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-Ndar7XvFdmue8SMl .rough-node .label text,#mermaid-svg-Ndar7XvFdmue8SMl .node .label text,#mermaid-svg-Ndar7XvFdmue8SMl .image-shape .label,#mermaid-svg-Ndar7XvFdmue8SMl .icon-shape .label{text-anchor:middle;}#mermaid-svg-Ndar7XvFdmue8SMl .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-Ndar7XvFdmue8SMl .rough-node .label,#mermaid-svg-Ndar7XvFdmue8SMl .node .label,#mermaid-svg-Ndar7XvFdmue8SMl .image-shape .label,#mermaid-svg-Ndar7XvFdmue8SMl .icon-shape .label{text-align:center;}#mermaid-svg-Ndar7XvFdmue8SMl .node.clickable{cursor:pointer;}#mermaid-svg-Ndar7XvFdmue8SMl .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-Ndar7XvFdmue8SMl 
.arrowheadPath{fill:#333333;}#mermaid-svg-Ndar7XvFdmue8SMl .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-Ndar7XvFdmue8SMl .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-Ndar7XvFdmue8SMl .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-Ndar7XvFdmue8SMl .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-Ndar7XvFdmue8SMl .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-Ndar7XvFdmue8SMl .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-Ndar7XvFdmue8SMl .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-Ndar7XvFdmue8SMl .cluster text{fill:#333;}#mermaid-svg-Ndar7XvFdmue8SMl .cluster span{color:#333;}#mermaid-svg-Ndar7XvFdmue8SMl div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-Ndar7XvFdmue8SMl .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-Ndar7XvFdmue8SMl rect.text{fill:none;stroke-width:0;}#mermaid-svg-Ndar7XvFdmue8SMl .icon-shape,#mermaid-svg-Ndar7XvFdmue8SMl .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-Ndar7XvFdmue8SMl .icon-shape p,#mermaid-svg-Ndar7XvFdmue8SMl .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-Ndar7XvFdmue8SMl .icon-shape rect,#mermaid-svg-Ndar7XvFdmue8SMl .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-Ndar7XvFdmue8SMl .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-Ndar7XvFdmue8SMl .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-Ndar7XvFdmue8SMl 
:root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Self-Attention \u5185\u90e8\u5904\u7406\u673a\u5236<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u5206\u6570 (Scores)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u6ce8\u610f\u529b\u6743\u91cd (Weights)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165\u5411\u91cf X  (Embedding &#043; Pos)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>W_Q \u77e9\u9635<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Query (Q)  \u62ff\u7740\u501f\u4e66\u6761<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>W_K \u77e9\u9635<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Key (K)  \u4e66\u5206\u7c7b\u53f7<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>W_V \u77e9\u9635<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Value (V)  \u4e66\u7684\u5185\u5bb9<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u70b9\u79ef\u8fd0\u7b97 (MatMul)  Score &#061; Q \u00d7 K^T<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Softmax  (\u5f52\u4e00\u5316\u4e3a\u6982\u7387 %)<\/p>\n<p><\/span><\/p>\n<p>           <span 
class=\"nodeLabel\"><\/p>\n<p>\u52a0\u6743\u6c42\u548c (MatMul)  Output &#061; % \u00d7 V<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6700\u7ec8\u8f93\u51fa Z  (\u5438\u6536\u4e86\u4e0a\u4e0b\u6587\u4fe1\u606f\u7684\u5411\u91cf)<\/p>\n<p><\/span><\/p>\n<h4>5. \u4e3a\u4ec0\u4e48\u8981\u9664\u4ee5 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"> <\/p>\n<p>           d <\/p>\n<p>           k <\/p>\n<p>        \\\\sqrt{d_k} <\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.04em;vertical-align: -0.1828em\"><\/span><span class=\"mord sqrt\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8572em\"><span class=\"svg-align\" style=\"top: -3em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord\" style=\"padding-left: 0.833em\"><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0315em\">k<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"\" style=\"top: -2.8172em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"hide-tail\" style=\"min-width: 0.853em;height: 1.08em\"> <\/p>\n<p>           <\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.1828em\"><span 
class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff1f;<\/h4>\n<p>\u5982\u679c <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        Q<\/p>\n<p>       Q<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span> \u548c <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        K<\/p>\n<p>       K<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span> \u7684\u7ef4\u5ea6\u5f88\u5927&#xff0c;\u70b9\u79ef\u7684\u7ed3\u679c\u4f1a\u53d8\u5f97\u975e\u5e38\u5927\u3002\u8fd9\u5c31\u597d\u6bd4\u4f60\u5728 Softmax \u91cc\u8f93\u5165\u4e86\u4e00\u4e2a\u51e0\u5343\u7684\u6570\u5b57&#xff0c;Softmax \u5c31\u4f1a\u53d8\u5f97\u6781\u7aef\u2014\u2014\u53ea\u8ba4\u6700\u5927\u7684\u90a3\u4e2a&#xff0c;\u5176\u4ed6\u7684\u5168\u53d8\u6210 0&#xff0c;\u5bfc\u81f4\u68af\u5ea6\u51e0\u4e4e\u6d88\u5931&#xff0c;\u6a21\u578b\u5b66\u4e0d\u52a8\u4e86\u3002\u9664\u4ee5\u4e00\u4e2a\u6570\u662f\u4e3a\u4e86\u628a\u6570\u503c\u62c9\u56de\u201c\u8212\u9002\u533a\u201d\u3002<\/p>\n<hr \/>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<li>Q\u3001K\u3001V \u662f\u540c\u4e00\u4e2a\u8bcd\u7684\u4e09\u4e2a\u4e0d\u540c\u4fa7\u9762\u3002<\/li>\n<li>Self-Attention \u5c31\u662f\u62ff <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         Q<\/p>\n<p>        Q<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span> \u67e5 <span class=\"katex--inline\"><span 
class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         K<\/p>\n<p>        K<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span>&#xff0c;\u7b97\u51fa\u6743\u91cd&#xff0c;\u7136\u540e\u52a0\u6743\u6c47\u603b <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         V<\/p>\n<p>        V<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span>\u3002<\/li>\n<li>\u901a\u8fc7\u8fd9\u4e2a\u673a\u5236&#xff0c;\u5355\u8bcd\u4e0d\u518d\u662f\u5b64\u5c9b&#xff0c;\u5b83\u4eec\u6839\u636e\u8bed\u4e49\u5173\u7cfb\u5efa\u7acb\u4e86\u6df1\u5c42\u7684\u8fde\u63a5\u3002<\/li>\n<p>\u4f46\u662f&#xff01;\u4eba\u770b\u6587\u7ae0\u4e0d\u80fd\u53ea\u7528\u4e00\u79cd\u89c6\u89d2\u3002\u6709\u7684\u8981\u770b\u8bed\u6cd5&#xff0c;\u6709\u7684\u8981\u770b\u6307\u4ee3&#xff0c;\u6709\u7684\u8981\u770b\u60c5\u7eea\u3002 \u73b0\u5728\u7684 Attention \u53ea\u6709\u4e00\u7ec4 QKV&#xff0c;\u8fd9\u5c31\u50cf\u53ea\u6709\u4e00\u53ea\u773c\u775b\u3002<\/p>\n<hr \/>\n<p>\u5982\u679c Transformer \u53ea\u6709\u8fd9\u4e00\u4e2a\u8111\u888b&#xff0c;\u5b83\u53ef\u80fd\u4f1a\u6bd4\u8f83\u201c\u6b7b\u677f\u201d\u3002\u4e3a\u4e86\u8ba9\u5b83\u53d8\u6210\u771f\u6b63\u7684\u516d\u8fb9\u5f62\u6218\u58eb&#xff0c;Google \u7684\u5de5\u7a0b\u5e08\u8d4b\u4e88\u4e86\u5b83\u201c\u4e09\u5934\u516d\u81c2\u201d\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u4e94\u7ae0&#xff1a;\u591a\u5934\u6ce8\u610f\u529b (Multi-Head Attention)\u2014\u2014\u4e09\u4e2a\u81ed\u76ae\u5320&#xff0c;\u9876\u4e2a\u8bf8\u845b\u4eae<\/h2>\n<h4>1. 
\u4e3a\u4ec0\u4e48\u8981\u641e\u8fd9\u4e48\u591a\u8111\u888b&#xff1f;<\/h4>\n<p>\u60f3\u8c61\u4e00\u4e0b&#xff0c;\u4f60\u6b63\u5728\u8bfb\u8fd9\u53e5\u5f71\u8bc4&#xff1a;<\/p>\n<p>\u201cThe movie was long but good.\u201d<\/p>\n<p>\u5982\u679c\u4f60\u53ea\u6709\u4e00\u79cd\u601d\u7ef4\u65b9\u5f0f&#xff08;\u4e00\u4e2a Head&#xff09;&#xff0c;\u4f60\u53ef\u80fd\u53ea\u5173\u6ce8\u5230\u4e86 movie \u548c long \u7684\u5173\u7cfb&#xff08;\u63cf\u8ff0\u65f6\u957f&#xff09;\u3002 \u4f46\u662f&#xff0c;\u8fd9\u53e5\u8bdd\u91cc\u660e\u663e\u8fd8\u6709\u53e6\u4e00\u5c42\u66f4\u91cd\u8981\u7684\u903b\u8f91&#xff1a;\u5c3d\u7ba1 long&#xff08;\u901a\u5e38\u662f\u8d2c\u4e49&#xff09;&#xff0c;\u4f46\u662f good&#xff08;\u8912\u4e49&#xff09;&#xff0c;\u8fd9\u91cc\u6709\u4e00\u4e2a\u8f6c\u6298\u5173\u7cfb\u3002<\/p>\n<p>\u5355\u5934\u6ce8\u610f\u529b\u7684\u7f3a\u9677\u5728\u4e8e&#xff1a;\u5b83\u53ef\u80fd\u5728\u540c\u4e00\u65f6\u95f4&#xff0c;\u53ea\u80fd\u4e13\u6ce8\u4e8e\u4e00\u79cd\u7c7b\u578b\u7684\u8bed\u4e49\u5173\u8054\u3002\u5982\u679c\u5b83\u5fd9\u7740\u770b\u8bed\u6cd5\u7ed3\u6784&#xff0c;\u53ef\u80fd\u5c31\u5ffd\u7565\u4e86\u60c5\u611f\u8272\u5f69\u3002<\/p>\n<p>Multi-Head Attention&#xff08;\u591a\u5934\u6ce8\u610f\u529b&#xff09; \u7684\u6838\u5fc3\u601d\u60f3\u5c31\u662f&#xff1a; \u65e2\u7136\u4e00\u4e2a\u8111\u888b\u4e0d\u591f\u7528&#xff0c;\u90a3\u6211\u5c31\u641e 8 \u4e2a&#xff01;<\/p>\n<ul>\n<li>Head 1&#xff1a;\u4e13\u95e8\u8d1f\u8d23\u770b\u8bed\u6cd5\u7ed3\u6784&#xff08;\u8c01\u662f\u4e3b\u8bed&#xff0c;\u8c01\u662f\u5bbe\u8bed&#xff09;\u3002<\/li>\n<li>Head 2&#xff1a;\u4e13\u95e8\u8d1f\u8d23\u770b\u6307\u4ee3\u5173\u7cfb&#xff08;it \u6307\u7684\u662f\u8c01&#xff09;\u3002<\/li>\n<li>Head 3&#xff1a;\u4e13\u95e8\u8d1f\u8d23\u770b\u65f6\u6001\u4fe1\u606f&#xff08;\u662f\u8fc7\u53bb\u53d1\u751f\u8fd8\u662f\u5c06\u6765\u53d1\u751f&#xff09;\u3002<\/li>\n<li>Head 
4&#xff1a;\u4e13\u95e8\u8d1f\u8d23\u770b\u60c5\u611f\u8272\u5f69&#xff08;\u662f\u5938\u8fd8\u662f\u9a82&#xff09;\u3002<\/li>\n<li>\u2026<\/li>\n<\/ul>\n<p>\u5927\u5bb6\u5404\u53f8\u5176\u804c&#xff0c;\u6700\u540e\u628a\u7ed3\u679c\u6c47\u603b\u8d77\u6765&#xff0c;\u8fd9\u5c31\u6784\u6210\u4e86\u4e00\u4e2a\u5bf9\u53e5\u5b50\u5168\u65b9\u4f4d\u3001\u591a\u89d2\u5ea6\u7684\u7406\u89e3\u3002<\/p>\n<hr \/>\n<h4>2. \u600e\u4e48\u201c\u780d\u201d\u6210\u591a\u5934&#xff1f;&#xff08;Split &amp; Concat&#xff09;<\/h4>\n<p>\u4f60\u53ef\u80fd\u4f1a\u62c5\u5fc3&#xff1a;\u201c\u641e 8 \u4e2a\u5934&#xff0c;\u8ba1\u7b97\u91cf\u5c82\u4e0d\u662f\u53d8\u6210\u4e86\u539f\u6765\u7684 8 \u500d&#xff1f;\u6211\u7684\u663e\u5361\u8fd8\u8981\u4e0d\u8981\u4e86&#xff1f;\u201d<\/p>\n<p>\u8fd9\u5c31\u662f Transformer \u8bbe\u8ba1\u7cbe\u5999\u7684\u5730\u65b9\u3002\u5b83\u5e76\u6ca1\u6709\u628a\u8ba1\u7b97\u91cf\u7ffb\u500d&#xff0c;\u800c\u662f\u628a\u539f\u6765\u7684\u5411\u91cf\u201c\u5207\u201d\u788e\u4e86\u3002<\/p>\n<p>\u5047\u8bbe\u539f\u672c\u7684\u8bcd\u5411\u91cf\u7ef4\u5ea6&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         d<\/p>\n<p>          m<\/p>\n<p>          o<\/p>\n<p>          d<\/p>\n<p>          e<\/p>\n<p>          l<\/p>\n<p>       d_{model}<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mathnormal mtight\">m<\/span><span class=\"mord mathnormal mtight\">o<\/span><span class=\"mord mathnormal mtight\">d<\/span><span 
class=\"mord mathnormal mtight\">e<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0197em\">l<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff09;\u662f 512\u3002 \u6211\u4eec\u8981\u641e 8 \u4e2a\u5934 (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        h<\/p>\n<p>        &#061;<\/p>\n<p>        8<\/p>\n<p>       h&#061;8<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6944em\"><\/span><span class=\"mord mathnormal\">h<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">8<\/span><\/span><\/span><\/span><\/span>)\u3002<\/p>\n<p>Transformer \u662f\u8fd9\u6837\u505a\u7684&#xff1a;<\/p>\n<li>\u5207\u5206&#xff1a;\u5b83\u628a 512 \u7ef4\u7684\u5411\u91cf&#xff0c;\u5207\u6210\u4e86 8 \u4efd&#xff0c;\u6bcf\u4efd\u53ea\u6709 64 \u7ef4 (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          d<\/p>\n<p>          k<\/p>\n<p>         &#061;<\/p>\n<p>         512<\/p>\n<p>         \/<\/p>\n<p>         8<\/p>\n<p>         &#061;<\/p>\n<p>         64<\/p>\n<p>        d_k &#061; 512 \/ 8 &#061; 64<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: 
0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0315em\">k<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord\">512\/8<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">64<\/span><\/span><\/span><\/span><\/span>)\u3002<\/li>\n<li>\u5e76\u884c&#xff1a;\u8fd9 8 \u4e2a\u201c\u5c0f\u8111\u888b\u201d\u5206\u522b\u62ff\u7740\u81ea\u5df1\u90a3 64 \u7ef4\u7684\u6570\u636e&#xff0c;\u540c\u65f6\u53bb\u8fdb\u884c <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         Q<\/p>\n<p>         ,<\/p>\n<p>         K<\/p>\n<p>         ,<\/p>\n<p>         V<\/p>\n<p>        Q, K, V<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span> 
\u7684\u81ea\u6ce8\u610f\u529b\u8ba1\u7b97\u3002<\/li>\n<li>\u72ec\u7acb&#xff1a;\u6ce8\u610f&#xff01;\u6bcf\u4e2a\u5934\u90fd\u6709\u81ea\u5df1\u72ec\u4eab\u7684\u4e00\u7ec4 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          W<\/p>\n<p>          Q<\/p>\n<p>         ,<\/p>\n<p>          W<\/p>\n<p>          K<\/p>\n<p>         ,<\/p>\n<p>          W<\/p>\n<p>          V<\/p>\n<p>        W^Q, W^K, W^V<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.0358em;vertical-align: -0.1944em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">Q<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 
0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u6743\u91cd\u77e9\u9635\u3002\u8fd9\u610f\u5473\u7740 Head 1 \u5b66\u5230\u7684\u4e1c\u897f&#xff0c;Head 2 \u6839\u672c\u4e0d\u77e5\u9053&#xff0c;\u4e92\u4e0d\u5e72\u6270\u3002<\/li>\n<li>\u62fc\u63a5&#xff1a;\u7b49 8 \u4e2a\u5934\u90fd\u7b97\u5b8c\u4e86&#xff0c;\u5206\u522b\u8f93\u51fa\u4e86 8 \u4e2a 64 \u7ef4\u7684\u5411\u91cf\u3002\u6211\u4eec\u5c31\u628a\u5b83\u4eec\u9996\u5c3e\u76f8\u8fde&#xff08;Concat&#xff09;&#xff0c;\u91cd\u65b0\u62fc\u56de\u6210\u4e00\u4e2a 512 \u7ef4\u7684\u5927\u5411\u91cf\u3002<\/li>\n<p>\u8fd9\u5c31\u597d\u6bd4&#xff1a; \u672c\u6765\u662f\u4e00\u4e2a\u5927\u795e&#xff08;512\u7ef4&#xff09;\u5728\u8bfb\u4e00\u672c\u4e66\u3002 \u73b0\u5728\u53d8\u6210\u4e86 8 \u4e2a\u666e\u901a\u4eba&#xff08;64\u7ef4&#xff09;&#xff0c;\u6bcf\u4e2a\u4eba\u53ea\u8bfb\u4e00\u7ae0\u3002 \u867d\u7136\u5355\u4eba\u7684\u80fd\u529b\u5f31\u4e86&#xff0c;\u4f46\u5927\u5bb6\u540c\u65f6\u8bfb&#xff0c;\u6700\u540e\u5f00\u4f1a\u628a\u8bfb\u540e\u611f\u62fc\u5728\u4e00\u8d77&#xff0c;\u6548\u7387\u6781\u9ad8\u4e14\u89c6\u89d2\u4e30\u5bcc\u3002<\/p>\n<hr \/>\n<h4>3. 
\u6700\u540e\u7684\u878d\u5408&#xff1a;\u7ebf\u6027\u53d8\u6362 (Linear)<\/h4>\n<p>\u628a 8 \u4e2a\u5934\u7684\u8f93\u51fa\u62fc\u8d77\u6765\u4e4b\u540e&#xff0c;\u867d\u7136\u7ef4\u5ea6\u53d8\u56de\u4e86 512&#xff0c;\u4f46\u5b83\u4eec\u4e4b\u95f4\u8fd8\u662f\u201c\u751f\u5206\u201d\u7684&#xff08;\u524d\u9762 64 \u4f4d\u662f Head 1 \u7684\u770b\u6cd5&#xff0c;\u540e\u9762 64 \u4f4d\u662f Head 2 \u7684\u770b\u6cd5\u2026&#xff09;\u3002<\/p>\n<p>\u6240\u4ee5&#xff0c;\u6700\u540e\u5fc5\u987b\u52a0\u4e00\u4e2a\u7ebf\u6027\u53d8\u6362\u5c42&#xff08;Linear Layer, <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          W<\/p>\n<p>          O<\/p>\n<p>        W^O<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8413em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">O<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff09;\u3002<\/p>\n<p>\u8fd9\u4e2a\u5c42\u7684\u4f5c\u7528\u5c31\u50cf\u662f\u4f1a\u8bae\u4e3b\u6301\u4eba\u6216\u8005\u9e21\u5c3e\u9152\u8c03\u9152\u5e08\u3002\u5b83\u628a\u8fd9\u4e00\u6761\u62fc\u63a5\u597d\u7684\u957f\u5411\u91cf\u6df7\u5408\u4e00\u4e0b&#xff0c;\u8ba9\u4e0d\u540c\u5934\u7684\u4fe1\u606f\u8fdb\u884c\u4ea4\u4e92\u548c\u878d\u5408&#xff0c;\u6700\u7ec8\u751f\u6210\u4e00\u4e2a\u7edf\u4e00\u7684\u8f93\u51fa\u5411\u91cf\u3002<\/p>\n<hr \/>\n<h4>4. 
\u6d41\u7a0b\u56fe\u89e3&#xff1a;\u591a\u5934\u5de5\u5382<\/h4>\n<p>\u8fd9\u4e00\u90e8\u5206\u6700\u9002\u5408\u7528\u56fe\u6765\u770b\u3002\u4f60\u4f1a\u53d1\u73b0\u5b83\u50cf\u6781\u4e86\u5de5\u5382\u91cc\u7684\u5206\u6d41\u2014\u52a0\u5de5\u2014\u6c47\u6d41\u6d41\u6c34\u7ebf\u3002<\/p>\n<p>  #mermaid-svg-8uJnM5X3JTCyX47j{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-8uJnM5X3JTCyX47j .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-8uJnM5X3JTCyX47j .error-icon{fill:#552222;}#mermaid-svg-8uJnM5X3JTCyX47j .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-8uJnM5X3JTCyX47j .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-8uJnM5X3JTCyX47j .marker{fill:#333333;stroke:#333333;}#mermaid-svg-8uJnM5X3JTCyX47j .marker.cross{stroke:#333333;}#mermaid-svg-8uJnM5X3JTCyX47j svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-8uJnM5X3JTCyX47j p{margin:0;}#mermaid-svg-8uJnM5X3JTCyX47j .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-8uJnM5X3JTCyX47j .cluster-label text{fill:#333;}#mermaid-svg-8uJnM5X3JTCyX47j .cluster-label span{color:#333;}#mermaid-svg-8uJnM5X3JTCyX47j .cluster-label span 
p{background-color:transparent;}#mermaid-svg-8uJnM5X3JTCyX47j .label text,#mermaid-svg-8uJnM5X3JTCyX47j span{fill:#333;color:#333;}#mermaid-svg-8uJnM5X3JTCyX47j .node rect,#mermaid-svg-8uJnM5X3JTCyX47j .node circle,#mermaid-svg-8uJnM5X3JTCyX47j .node ellipse,#mermaid-svg-8uJnM5X3JTCyX47j .node polygon,#mermaid-svg-8uJnM5X3JTCyX47j .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-8uJnM5X3JTCyX47j .rough-node .label text,#mermaid-svg-8uJnM5X3JTCyX47j .node .label text,#mermaid-svg-8uJnM5X3JTCyX47j .image-shape .label,#mermaid-svg-8uJnM5X3JTCyX47j .icon-shape .label{text-anchor:middle;}#mermaid-svg-8uJnM5X3JTCyX47j .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-8uJnM5X3JTCyX47j .rough-node .label,#mermaid-svg-8uJnM5X3JTCyX47j .node .label,#mermaid-svg-8uJnM5X3JTCyX47j .image-shape .label,#mermaid-svg-8uJnM5X3JTCyX47j .icon-shape .label{text-align:center;}#mermaid-svg-8uJnM5X3JTCyX47j .node.clickable{cursor:pointer;}#mermaid-svg-8uJnM5X3JTCyX47j .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-8uJnM5X3JTCyX47j .arrowheadPath{fill:#333333;}#mermaid-svg-8uJnM5X3JTCyX47j .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-8uJnM5X3JTCyX47j .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-8uJnM5X3JTCyX47j .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-8uJnM5X3JTCyX47j .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-8uJnM5X3JTCyX47j .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-8uJnM5X3JTCyX47j .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-8uJnM5X3JTCyX47j .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-8uJnM5X3JTCyX47j .cluster text{fill:#333;}#mermaid-svg-8uJnM5X3JTCyX47j .cluster span{color:#333;}#mermaid-svg-8uJnM5X3JTCyX47j 
div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-8uJnM5X3JTCyX47j .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-8uJnM5X3JTCyX47j rect.text{fill:none;stroke-width:0;}#mermaid-svg-8uJnM5X3JTCyX47j .icon-shape,#mermaid-svg-8uJnM5X3JTCyX47j .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-8uJnM5X3JTCyX47j .icon-shape p,#mermaid-svg-8uJnM5X3JTCyX47j .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-8uJnM5X3JTCyX47j .icon-shape rect,#mermaid-svg-8uJnM5X3JTCyX47j .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-8uJnM5X3JTCyX47j .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-8uJnM5X3JTCyX47j .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-8uJnM5X3JTCyX47j :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Multi-Head Attention \u673a\u5236<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u5e76\u884c\u5904\u7406 (8\u4e2a\u5e73\u884c\u5b87\u5b99)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>Q,K,V \u8ba1\u7b97<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>Q,K,V \u8ba1\u7b97<\/p>\n<p><\/span><\/p>\n<p>           <span 
class=\"edgeLabel\"><\/p>\n<p>Q,K,V \u8ba1\u7b97<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Head 1 (\u5173\u6ce8\u8bed\u6cd5)  64\u7ef4<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165\u5411\u91cf Z (512\u7ef4)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u5207\u5206<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Head 2 (\u5173\u6ce8\u6307\u4ee3)  64\u7ef4<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>&#8230;<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Head 8 (\u5173\u6ce8\u60c5\u611f)  64\u7ef4<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa 1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa 2<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa 8<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u62fc\u63a5 (Concat)  64 x 8 &#061; 512\u7ef4<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u7ebf\u6027\u53d8\u6362 (Linear\/WO)  \u4fe1\u606f\u5927\u878d\u5408<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6700\u7ec8\u8f93\u51fa (Multi-Head Output)<\/p>\n<p><\/span><\/p>\n<h4>5. 
\u8fd9\u91cc\u7684\u201c\u5f69\u86cb\u201d<\/h4>\n<p>\u867d\u7136\u6211\u4eec\u5728\u56fe\u91cc\u8bf4\u662f\u201c\u5173\u6ce8\u8bed\u6cd5\u201d\u3001\u201c\u5173\u6ce8\u60c5\u611f\u201d&#xff0c;\u4f46\u5176\u5b9e\u8fd9\u662f\u6211\u4eec\u4eba\u7c7b\u7684\u4e8b\u540e\u8bf8\u845b\u4eae\u3002 \u5728\u8bad\u7ec3\u5f00\u59cb\u524d&#xff0c;\u673a\u5668\u5e76\u4e0d\u77e5\u9053 Head 1 \u5e94\u8be5\u5b66\u4ec0\u4e48\u3002 \u8fd9\u4e9b\u529f\u80fd\u662f AI \u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u81ea\u5df1\u5b66\u4f1a\u7684\u3002<\/p>\n<ul>\n<li>\u6709\u65f6\u5019\u4f60\u4f1a\u53d1\u73b0 Head 3 \u548c Head 4 \u5b66\u91cd\u4e86&#xff0c;\u8fd9\u88ab\u79f0\u4e3a\u201c\u591a\u5934\u5197\u4f59\u201d&#xff0c;\u4f46\u4e3a\u4e86\u4fdd\u9669\u8d77\u89c1&#xff0c;\u5197\u4f59\u4e00\u70b9\u4e5f\u6ca1\u5173\u7cfb\u3002<\/li>\n<\/ul>\n<hr \/>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<ol>\n<li>Multi-Head \u89e3\u51b3\u4e86\u5355\u5934\u6ce8\u610f\u529b\u89c6\u89d2\u5355\u4e00\u7684\u95ee\u9898\u3002<\/li>\n<li>\u5b83\u901a\u8fc7\u964d\u4f4e\u7ef4\u5ea6\u6765\u5b9e\u73b0\u5e76\u884c\u591a\u8def\u8ba1\u7b97&#xff0c;\u8ba1\u7b97\u603b\u91cf\u5e76\u6ca1\u6709\u66b4\u589e\u3002<\/li>\n<li>\u5b83\u5305\u542b\u4e86&#xff1a;\u5207\u5206 -&gt; \u72ec\u7acb Attention -&gt; \u62fc\u63a5 -&gt; \u7ebf\u6027\u878d\u5408 \u56db\u4e2a\u6b65\u9aa4\u3002<\/li>\n<\/ol>\n<p>\u5230\u8fd9\u91cc&#xff0c;Attention \u90e8\u5206\u7684\u6838\u5fc3\u5c31\u8bb2\u5b8c\u4e86&#xff01;\u4f46\u662f&#xff0c;\u5982\u679c\u4f60\u628a\u73b0\u5728\u7684\u6a21\u578b\u53e0\u4e2a 100 \u5c42&#xff0c;\u5b83\u53ef\u80fd\u4f1a\u6839\u672c\u8bad\u7ec3\u4e0d\u8d77\u6765&#xff0c;\u56e0\u4e3a\u6570\u636e\u4f20\u7740\u4f20\u7740\u5c31\u201c\u6b7b\u201d\u4e86&#xff08;\u68af\u5ea6\u6d88\u5931\/\u7206\u70b8&#xff09;\u3002<\/p>\n<p>\u4e3a\u4e86\u8ba9\u8fd9\u680b\u5927\u697c\u80fd\u76d6\u5f97\u66f4\u9ad8\u66f4\u7a33&#xff0c;\u6211\u4eec\u9700\u8981\u5f15\u5165\u4e24\u4e2a\u5efa\u7b51\u5b66\u795e\u5668&#xff1a;\u6b8b\u5dee\u8fde\u63a5 (Residual Connection) \u548c 
\u5c42\u5f52\u4e00\u5316 (Layer Normalization)\u3002<\/p>\n<hr \/>\n<p>\u5982\u679c\u628a Transformer \u6bd4\u4f5c\u76d6\u697c&#xff0c;\u524d\u51e0\u7ae0\u6211\u4eec\u5df2\u7ecf\u9020\u597d\u4e86\u6700\u6838\u5fc3\u7684\u7816\u5757&#xff08;Attention&#xff09;\u3002\u4f46\u662f&#xff0c;\u53ea\u8981\u7a0d\u5fae\u6709\u4e9b\u5de5\u7a0b\u7ecf\u9a8c\u7684\u4eba\u90fd\u77e5\u9053&#xff1a;\u697c\u76d6\u5f97\u592a\u9ad8&#xff0c;\u662f\u4f1a\u584c\u7684\u3002<\/p>\n<p>\u5728\u6df1\u5ea6\u5b66\u4e60\u91cc&#xff0c;\u8fd9\u4e2a\u201c\u584c\u201d\u8868\u73b0\u4e3a\u68af\u5ea6\u6d88\u5931&#xff08;Gradient Vanishing&#xff09;\u6216\u8005\u7f51\u7edc\u9000\u5316\u2014\u2014\u8d8a\u6df1\u7684\u7f51\u7edc\u53cd\u800c\u8d8a\u7b28\u3002<\/p>\n<p>\u4e3a\u4e86\u8ba9 Transformer \u80fd\u591f\u6beb\u65e0\u987e\u8651\u5730\u53e0\u4e0a\u51e0\u5341\u5c42\u751a\u81f3\u4e0a\u767e\u5c42&#xff0c;Google \u5de5\u7a0b\u5e08\u5f15\u5165\u4e86\u4e24\u6839\u201c\u5b9a\u6d77\u795e\u9488\u201d&#xff1a;\u6b8b\u5dee\u8fde\u63a5&#xff08;Residual Connection&#xff09; \u548c \u5c42\u5f52\u4e00\u5316&#xff08;Layer Normalization&#xff09;\u3002<\/p>\n<p>\u8fd9\u4e00\u7ae0&#xff0c;\u6211\u4eec\u770b\u770b\u8fd9\u4e24\u4e2a\u542c\u8d77\u6765\u5f88\u9ad8\u5927\u4e0a\u3001\u5b9e\u9645\u4e0a\u5374\u975e\u5e38\u6734\u7d20\u7684\u673a\u5236\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u516d\u7ae0&#xff1a;\u8fde\u63a5\u4e0e\u5f52\u4e00\u5316\u2014\u2014\u9632\u6b62\u201c\u697c\u5012\u584c\u201d\u7684\u5de5\u7a0b\u5b66<\/h2>\n<p>\u5728 Transformer \u7684\u6bcf\u4e00\u5c42&#xff08;Layer&#xff09;\u91cc&#xff0c;\u65e0\u8bba\u662f Attention \u8fd8\u662f\u540e\u9762\u8981\u8bb2\u7684 FFN&#xff0c;\u90fd\u88ab\u5305\u88f9\u5728\u4e00\u4e2a\u56fa\u5b9a\u7684\u7ed3\u6784\u91cc&#xff1a;Add &amp; Norm\u3002<\/p>\n<h4>1. 
\u6b8b\u5dee\u8fde\u63a5 (Residual Connection\/Add)&#xff1a;\u7ed9\u4fe1\u606f\u7559\u6761\u9000\u8def<\/h4>\n<p>\u8fd8\u8bb0\u5f97\u6211\u4eec\u524d\u51e0\u7ae0\u8bb2\u7684 Multi-Head Attention \u5417&#xff1f;\u867d\u7136\u5b83\u5f88\u5389\u5bb3&#xff0c;\u4f46\u4e07\u4e00\u8fd9\u4e00\u5c42\u201c\u8111\u62bd\u201d\u4e86&#xff0c;\u628a\u539f\u672c\u597d\u7684\u4fe1\u606f\u7ed9\u7b97\u574f\u4e86\u600e\u4e48\u529e&#xff1f;<\/p>\n<p>\u6b8b\u5dee\u8fde\u63a5&#xff08;\u6e90\u81ea ResNet&#xff09;\u7684\u6838\u5fc3\u601d\u60f3\u5c31\u662f&#xff1a;\u4e0d\u8981\u53ea\u76f8\u4fe1\u4f60\u7684\u8fd0\u7b97\u7ed3\u679c&#xff0c;\u8fd8\u8981\u4fdd\u7559\u539f\u59cb\u7684\u8f93\u5165\u3002<\/p>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         O<\/p>\n<p>         u<\/p>\n<p>         t<\/p>\n<p>         p<\/p>\n<p>         u<\/p>\n<p>         t<\/p>\n<p>         &#061;<\/p>\n<p>         L<\/p>\n<p>         a<\/p>\n<p>         y<\/p>\n<p>         e<\/p>\n<p>         r<\/p>\n<p>         (<\/p>\n<p>         x<\/p>\n<p>         )<\/p>\n<p>         &#043;<\/p>\n<p>         x<\/p>\n<p>        Output &#061; Layer(x) &#043; x<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">O<\/span><span class=\"mord mathnormal\">u<\/span><span class=\"mord mathnormal\">tp<\/span><span class=\"mord mathnormal\">u<\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\">L<\/span><span class=\"mord mathnormal\">a<\/span><span class=\"mord mathnormal\" style=\"margin-right: 
0.0278em\">yer<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mclose\">)<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord mathnormal\">x<\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u6284\u4f5c\u4e1a\u7684\u827a\u672f<\/h5>\n<p>\u60f3\u8c61\u4f60\u5728\u505a\u6570\u5b66\u9898&#xff08;\u8fd9\u4e00\u5c42\u7f51\u7edc&#xff09;\u3002<\/p>\n<ul>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          x<\/p>\n<p>         x<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord mathnormal\">x<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u662f\u4f60\u4e0a\u4e00\u6b21\u8003\u8bd5\u7684\u6210\u7ee9&#xff08;\u6216\u8005\u53c2\u8003\u7b54\u6848&#xff09;\u3002<\/li>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          L<\/p>\n<p>          a<\/p>\n<p>          y<\/p>\n<p>          e<\/p>\n<p>          r<\/p>\n<p>          (<\/p>\n<p>          x<\/p>\n<p>          )<\/p>\n<p>         Layer(x)<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\">L<\/span><span class=\"mord mathnormal\">a<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">yer<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span 
class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u662f\u4f60\u73b0\u5728\u7684\u8ba1\u7b97\u8fc7\u7a0b\u3002<\/li>\n<\/ul>\n<p>\u6b8b\u5dee\u8fde\u63a5\u7684\u610f\u601d\u662f&#xff1a;\u4f60\u53ef\u4ee5\u53bb\u8ba1\u7b97\u65b0\u7684\u7ed3\u679c&#xff0c;\u4f46\u6700\u540e\u4ea4\u5377\u7684\u65f6\u5019&#xff0c;\u8981\u628a\u4f60\u7b97\u51fa\u6765\u7684\u4e1c\u897f\u52a0\u4e0a\u539f\u6765\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         x<\/p>\n<p>        x<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord mathnormal\">x<\/span><\/span><\/span><\/span><\/span>\u3002<\/p>\n<p>\u8fd9\u6837\u8bbe\u8ba1\u6709\u4e24\u4e2a\u5de8\u5927\u7684\u597d\u5904&#xff1a;<\/p>\n<li>\u4fdd\u5e95\u673a\u5236&#xff1a;\u4e07\u4e00\u4f60\u8fd9\u6b21\u7b97\u5f97\u4e00\u56e2\u7cdf&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         L<\/p>\n<p>         a<\/p>\n<p>         y<\/p>\n<p>         e<\/p>\n<p>         r<\/p>\n<p>         (<\/p>\n<p>         x<\/p>\n<p>         )<\/p>\n<p>        Layer(x)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\">L<\/span><span class=\"mord mathnormal\">a<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">yer<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span> \u63a5\u8fd1 0 \u6216\u4e71\u7801&#xff09;&#xff0c;\u53ea\u8981\u52a0\u4e0a <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         x<\/p>\n<p>        x<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord 
mathnormal\">x<\/span><\/span><\/span><\/span><\/span>&#xff0c;\u81f3\u5c11\u7ed3\u679c\u4e0d\u4f1a\u6bd4\u539f\u6765\u66f4\u5dee\u3002\u8fd9\u76f8\u5f53\u4e8e\u7ed9\u4fe1\u606f\u5f00\u4e86\u4e00\u6761 VIP \u76f4\u901a\u8f66&#xff0c;\u8ba9\u5b83\u80fd\u65e0\u635f\u901a\u8fc7\u3002<\/li>\n<li>\u68af\u5ea6\u9ad8\u901f\u516c\u8def&#xff1a;\u5728\u53cd\u5411\u4f20\u64ad&#xff08;\u8bad\u7ec3&#xff09;\u65f6&#xff0c;\u68af\u5ea6\u53ef\u4ee5\u901a\u8fc7\u8fd9\u6761\u76f4\u8fde\u901a\u8def&#xff0c;\u7545\u901a\u65e0\u963b\u5730\u4f20\u56de\u6700\u5f00\u59cb\u7684\u5c42&#xff0c;\u5927\u5927\u7f13\u89e3\u4e86\u201c\u68af\u5ea6\u6d88\u5931\u201d\u8fd9\u4e2a\u5343\u53e4\u96be\u9898\u3002<\/li>\n<h4>2. \u5c42\u5f52\u4e00\u5316 (Layer Normalization\/Norm)&#xff1a;\u8ba9\u5927\u5bb6\u90fd\u5728\u540c\u4e00\u8d77\u8dd1\u7ebf\u4e0a<\/h4>\n<p>\u795e\u7ecf\u7f51\u7edc\u6700\u6015\u201c\u5927\u8d77\u5927\u843d\u201d\u3002 \u5982\u679c\u7b2c\u4e00\u5c42\u7684\u8f93\u51fa\u662f 0.5&#xff0c;\u7b2c\u4e8c\u5c42\u53d8\u6210\u4e86 500&#xff0c;\u7b2c\u4e09\u5c42\u53d8\u6210\u4e86 50000\u2026\u2026\u8fd9\u6570\u503c\u4e00\u7206\u70b8&#xff0c;\u540e\u9762\u7684\u6570\u5b66\u516c\u5f0f\u5c31\u6ca1\u6cd5\u7b97\u4e86&#xff08;\u6fc0\u6d3b\u51fd\u6570\u4f1a\u9971\u548c&#xff09;\u3002<\/p>\n<p>\u6211\u4eec\u9700\u8981\u628a\u6bcf\u4e00\u5c42\u7684\u8f93\u51fa&#xff0c;\u90fd\u5f3a\u884c\u62c9\u56de\u5230\u4e00\u4e2a\u6807\u51c6\u7684\u8303\u56f4&#xff08;\u6bd4\u5982\u5747\u503c\u4e3a 0&#xff0c;\u65b9\u5dee\u4e3a 1&#xff09;\u3002<\/p>\n<p>\u5728\u56fe\u50cf\u5904\u7406&#xff08;CV&#xff09;\u91cc&#xff0c;\u5927\u5bb6\u5e38\u7528 Batch Norm&#xff08;\u6309\u6279\u6b21\u5f52\u4e00\u5316&#xff09;\u3002\u4f46\u5728 NLP \u91cc&#xff0c;\u56e0\u4e3a\u53e5\u5b50\u957f\u77ed\u4e0d\u4e00&#xff0c;Batch Norm \u5f88\u96be\u7528\u3002\u6240\u4ee5 Transformer \u7528\u7684\u662f Layer Norm\u3002<\/p>\n<h5>&#x1f393; 
\u751f\u52a8\u6bd4\u55bb&#xff1a;\u6807\u51c6\u5316\u8003\u8bd5\u5206\u6570<\/h5>\n<p>\u60f3\u8c61\u73ed\u91cc\u6709\u5b66\u9738\u548c\u5b66\u6e23\u3002<\/p>\n<ul>\n<li>\u8fd9\u6b21\u8003\u8bd5\u592a\u96be&#xff0c;\u5927\u5bb6\u5e73\u5747\u5206\u53ea\u6709 30 \u5206\u3002<\/li>\n<li>\u4e0b\u6b21\u8003\u8bd5\u592a\u7b80\u5355&#xff0c;\u5927\u5bb6\u5e73\u5747\u5206 90 \u5206\u3002<\/li>\n<\/ul>\n<p>\u5982\u679c\u4e0d\u5904\u7406&#xff0c;\u8fd9\u4e24\u6b21\u5206\u6570\u7684\u542b\u4e49\u5b8c\u5168\u4e0d\u540c\u3002Layer Norm \u505a\u7684\u4e8b\u60c5\u5c31\u662f&#xff1a; \u4e0d\u7ba1\u5377\u5b50\u96be\u6613&#xff0c;\u6211\u628a\u4f60\u8fd9\u4e00\u884c\u7684\u6240\u6709\u5206\u6570\u8fdb\u884c\u201c\u7f29\u653e\u201d\u3002<\/p>\n<ul>\n<li>\u628a\u5e73\u5747\u5206&#xff08;\u5747\u503c&#xff09;\u5f3a\u884c\u62c9\u56de 0\u3002<\/li>\n<li>\u628a\u5206\u6563\u7a0b\u5ea6&#xff08;\u65b9\u5dee&#xff09;\u5f3a\u884c\u62c9\u56de 1\u3002<\/li>\n<\/ul>\n<p>\u8fd9\u6837&#xff0c;\u65e0\u8bba\u6570\u636e\u5728\u91cc\u9762\u600e\u4e48\u6298\u817e&#xff0c;\u4e0b\u4e00\u5c42\u7f51\u7edc\u63a5\u6536\u5230\u7684&#xff0c;\u6c38\u8fdc\u662f\u5206\u5e03\u7a33\u5b9a\u3001\u6e05\u723d\u5e72\u51c0\u7684\u6570\u636e\u3002<\/p>\n<h4>3. Add &amp; Norm \u7684\u7ec4\u5408\u62f3<\/h4>\n<p>\u5728 Transformer \u7684\u8bba\u6587\u56fe\u91cc&#xff0c;\u4f60\u4f1a\u770b\u5230\u6bcf\u4e2a\u6a21\u5757\u5468\u56f4\u90fd\u6709\u4e00\u5708\u7ebf\u3002\u8fd9\u5c31\u662f Add &amp; Norm \u7684\u5de5\u4f5c\u6d41\u7a0b\u3002<\/p>\n<ol>\n<li>Input \u5175\u5206\u4e24\u8def\u3002<\/li>\n<li>\u4e00\u8def\u53bb\u5e72\u6d3b&#xff08;\u505a Attention \u8ba1\u7b97&#xff09;\u3002<\/li>\n<li>\u4e00\u8def\u5728\u65c1\u8fb9\u8eba\u5e73&#xff08;\u4fdd\u7559\u539f\u6837&#xff09;\u3002<\/li>\n<li>\u5e72\u5b8c\u6d3b\u56de\u6765&#xff0c;\u628a\u4e24\u8def\u7ed3\u679c\u76f8\u52a0&#xff08;Add&#xff09;\u3002<\/li>\n<li>\u6700\u540e\u628a\u7ed3\u679c\u62ff\u53bb\u6807\u51c6\u5316&#xff08;Norm&#xff09;\u3002<\/li>\n<\/ol>\n<h4>4. 
\u6d41\u7a0b\u56fe\u89e3&#xff1a;VIP \u901a\u9053\u4e0e\u5b89\u68c0\u95e8<\/h4>\n<p>\u8ba9\u6211\u4eec\u7528 Mermaid \u770b\u770b\u8fd9\u4e2a\u7ed3\u6784\u662f\u5982\u4f55\u5305\u88f9 Attention \u7684\u3002<\/p>\n<p>  #mermaid-svg-v2LbDUFgISpGP2mm{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-v2LbDUFgISpGP2mm .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-v2LbDUFgISpGP2mm .error-icon{fill:#552222;}#mermaid-svg-v2LbDUFgISpGP2mm .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-v2LbDUFgISpGP2mm .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-v2LbDUFgISpGP2mm .marker{fill:#333333;stroke:#333333;}#mermaid-svg-v2LbDUFgISpGP2mm .marker.cross{stroke:#333333;}#mermaid-svg-v2LbDUFgISpGP2mm svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-v2LbDUFgISpGP2mm p{margin:0;}#mermaid-svg-v2LbDUFgISpGP2mm .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-v2LbDUFgISpGP2mm .cluster-label text{fill:#333;}#mermaid-svg-v2LbDUFgISpGP2mm .cluster-label span{color:#333;}#mermaid-svg-v2LbDUFgISpGP2mm .cluster-label span p{background-color:transparent;}#mermaid-svg-v2LbDUFgISpGP2mm .label 
text,#mermaid-svg-v2LbDUFgISpGP2mm span{fill:#333;color:#333;}#mermaid-svg-v2LbDUFgISpGP2mm .node rect,#mermaid-svg-v2LbDUFgISpGP2mm .node circle,#mermaid-svg-v2LbDUFgISpGP2mm .node ellipse,#mermaid-svg-v2LbDUFgISpGP2mm .node polygon,#mermaid-svg-v2LbDUFgISpGP2mm .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-v2LbDUFgISpGP2mm .rough-node .label text,#mermaid-svg-v2LbDUFgISpGP2mm .node .label text,#mermaid-svg-v2LbDUFgISpGP2mm .image-shape .label,#mermaid-svg-v2LbDUFgISpGP2mm .icon-shape .label{text-anchor:middle;}#mermaid-svg-v2LbDUFgISpGP2mm .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-v2LbDUFgISpGP2mm .rough-node .label,#mermaid-svg-v2LbDUFgISpGP2mm .node .label,#mermaid-svg-v2LbDUFgISpGP2mm .image-shape .label,#mermaid-svg-v2LbDUFgISpGP2mm .icon-shape .label{text-align:center;}#mermaid-svg-v2LbDUFgISpGP2mm .node.clickable{cursor:pointer;}#mermaid-svg-v2LbDUFgISpGP2mm .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-v2LbDUFgISpGP2mm .arrowheadPath{fill:#333333;}#mermaid-svg-v2LbDUFgISpGP2mm .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-v2LbDUFgISpGP2mm .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-v2LbDUFgISpGP2mm .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-v2LbDUFgISpGP2mm .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-v2LbDUFgISpGP2mm .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-v2LbDUFgISpGP2mm .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-v2LbDUFgISpGP2mm .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-v2LbDUFgISpGP2mm .cluster text{fill:#333;}#mermaid-svg-v2LbDUFgISpGP2mm .cluster span{color:#333;}#mermaid-svg-v2LbDUFgISpGP2mm div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet 
ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-v2LbDUFgISpGP2mm .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-v2LbDUFgISpGP2mm rect.text{fill:none;stroke-width:0;}#mermaid-svg-v2LbDUFgISpGP2mm .icon-shape,#mermaid-svg-v2LbDUFgISpGP2mm .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-v2LbDUFgISpGP2mm .icon-shape p,#mermaid-svg-v2LbDUFgISpGP2mm .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-v2LbDUFgISpGP2mm .icon-shape rect,#mermaid-svg-v2LbDUFgISpGP2mm .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-v2LbDUFgISpGP2mm .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-v2LbDUFgISpGP2mm .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-v2LbDUFgISpGP2mm :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Add &amp; Norm \u6a21\u5757\u8be6\u89e3<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u52aa\u529b\u5e72\u6d3b\u533a<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u53bb\u8ba1\u7b97 F(x)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>VIP \u76f4\u901a\u8f66 (\u539f\u6837\u4fdd\u7559 x)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u8ba1\u7b97\u7ed3\u679c<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Multi-Head Attention  (\u6216\u8005 FFN)<\/p>\n<p><\/span><\/p>\n<p>           <span 
class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165\u5411\u91cf x<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u5206\u6d41\u70b9<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\">&#043;<\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Layer Norm  (\u5f52\u4e00\u5316\u5904\u7406)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa &#061; Norm(x &#043; F(x))<\/p>\n<p><\/span><\/p>\n<p>\u56fe\u89e3\u8bf4\u660e&#xff1a;\u6ce8\u610f\u90a3\u6761\u865a\u7ebf&#xff0c;\u5b83\u5c31\u662f\u6b8b\u5dee\u8fde\u63a5\u3002\u5b83\u7ed5\u8fc7\u4e86\u590d\u6742\u7684 Attention \u8ba1\u7b97&#xff0c;\u76f4\u63a5\u8fde\u5230\u4e86\u52a0\u53f7\u4e0a\u3002\u8fd9\u5c31\u662f\u6df1\u5c42\u7f51\u7edc\u4e0d\u9000\u5316\u7684\u79d8\u5bc6\u6b66\u5668\u3002<\/p>\n<hr \/>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<li>\u6b8b\u5dee\u8fde\u63a5 (Add)&#xff1a;\u7ed9\u6570\u636e\u5f00\u540e\u95e8&#xff0c;\u4fdd\u8bc1\u6df1\u5c42\u7f51\u7edc\u4e5f\u80fd\u8bad\u7ec3&#xff0c;\u9632\u6b62\u5b66\u5f97\u8d8a\u4e45\u8d8a\u50bb\u3002<\/li>\n<li>\u5c42\u5f52\u4e00\u5316 (Norm)&#xff1a;\u628a\u6570\u636e\u6574\u7406\u5e72\u51c0&#xff0c;\u9632\u6b62\u6570\u503c\u7206\u70b8\u6216\u6d88\u5931&#xff0c;\u8ba9\u8bad\u7ec3\u66f4\u7a33\u5b9a\u3002<\/li>\n<li>Add &amp; Norm \u662f Transformer \u7684\u201c\u6df7\u51dd\u571f\u201d&#xff0c;\u6bcf\u4e00\u5c42\u90fd\u79bb\u4e0d\u5f00\u5b83\u3002<\/li>\n<p>\u73b0\u5728&#xff0c;\u6211\u4eec\u7684\u8f93\u5165\u7ecf\u8fc7\u4e86 Attention&#xff08;\u63d0\u53d6\u7279\u5f81&#xff09;&#xff0c;\u53c8\u7ecf\u8fc7\u4e86 Add &amp; Norm&#xff08;\u7a33\u56fa\u7ed3\u6784&#xff09;\u3002<\/p>\n<p>\u63a5\u4e0b\u6765&#xff0c;\u6570\u636e\u5c06\u8fdb\u5165 Encoder \u8fd9\u4e00\u5c42\u7684\u6700\u540e\u4e00\u4e2a\u7ec4\u4ef6\u2014\u2014\u524d\u9988\u795e\u7ecf\u7f51\u7edc (Feed-Forward Networks)\u3002 \u5982\u679c\u8bf4 Attention \u662f\u201c\u89c2\u5bdf\u201d&#xff0c;\u90a3 FFN 
\u5c31\u662f\u201c\u601d\u8003\u201d\u548c\u201c\u6d88\u5316\u201d\u3002<\/p>\n<hr \/>\n<p>\u8fd9\u4e00\u7ae0\u6211\u4eec\u6765\u8bb2\u8bb2 Transformer \u91cc\u6700\u5bb9\u6613\u88ab\u5ffd\u89c6&#xff0c;\u4f46\u53c2\u6570\u91cf\u5176\u5b9e\u6700\u5927\u7684\u90e8\u5206\u3002<\/p>\n<p>\u5728\u524d\u9762\u7684\u7ae0\u8282\u91cc&#xff0c;Attention \u5c31\u50cf\u662f\u4e00\u4e2a\u793e\u4ea4\u8fbe\u4eba&#xff0c;\u5b83\u5fd9\u7740\u5728\u4e0d\u540c\u7684\u5355\u8bcd\u4e4b\u95f4\u7275\u7ebf\u642d\u6865&#xff0c;\u641e\u6e05\u695a\u201c\u8c01\u548c\u8c01\u662f\u4eb2\u621a\u201d\u3002<\/p>\n<p>\u4f46\u5149\u6709\u793e\u4ea4\u662f\u4e0d\u591f\u7684\u3002\u6536\u96c6\u4e86\u4e00\u5806\u4fe1\u606f\u4e4b\u540e&#xff0c;\u6bcf\u4e2a\u5355\u8bcd\u90fd\u9700\u8981\u56de\u5230\u81ea\u5df1\u7684\u5de5\u4f4d\u4e0a&#xff0c;\u51b7\u9759\u4e0b\u6765&#xff0c;\u628a\u8fd9\u4e9b\u4fe1\u606f\u6d88\u5316\u3001\u5438\u6536\u3001\u5185\u5316\u6210\u81ea\u5df1\u7684\u4e1c\u897f\u3002<\/p>\n<p>\u8fd9\u5c31\u662f \u524d\u9988\u795e\u7ecf\u7f51\u7edc (Feed-Forward Networks, FFN) \u7684\u5de5\u4f5c\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u4e03\u7ae0&#xff1a;\u524d\u9988\u795e\u7ecf\u7f51\u7edc (Feed-Forward Networks)\u2014\u2014\u8bb0\u5fc6\u7684\u7cbe\u70bc<\/h2>\n<h4>1. 
\u793e\u4ea4\u540e\u7684\u72ec\u5904&#xff1a;Position-wise<\/h4>\n<p>\u9996\u5148&#xff0c;\u6211\u4eec\u8981\u7ea0\u6b63\u4e00\u4e2a\u6982\u5ff5\u3002\u867d\u7136\u5b83\u53eb\u201c\u795e\u7ecf\u7f51\u7edc\u201d&#xff0c;\u4f46\u5728 Transformer \u7684\u8fd9\u4e00\u5c42\u91cc&#xff0c;\u5b83\u5176\u5b9e\u662f\u975e\u5e38\u5b64\u50fb\u7684\u3002<\/p>\n<ul>\n<li>Attention \u5c42&#xff1a;\u662f\u5168\u5c40\u7684\u3002\u5b83\u770b\u904d\u4e86\u6574\u4e2a\u53e5\u5b50&#xff0c;\u628a\u522b\u4eba\u7684\u4fe1\u606f\u62ff\u8fc7\u6765\u3002<\/li>\n<li>FFN \u5c42&#xff1a;\u662f\u72ec\u7acb\u7684&#xff08;Position-wise&#xff09;\u3002\u5b83\u5355\u72ec\u5904\u7406\u6bcf\u4e00\u4e2a\u8bcd\u5411\u91cf&#xff0c;\u4e0d\u770b\u5468\u56f4\u7684\u4eba\u3002<\/li>\n<\/ul>\n<p>\u6bd4\u55bb\u65f6\u523b&#xff1a;<\/p>\n<ul>\n<li>Attention \u662f\u5f00\u4f1a\u8ba8\u8bba\u3002\u5927\u5bb6\u4ea4\u6362\u610f\u89c1&#xff0c;\u6211\u542c\u542c\u4f60\u7684\u60f3\u6cd5&#xff0c;\u4f60\u542c\u542c\u6211\u7684\u60f3\u6cd5\u3002<\/li>\n<li>FFN \u662f\u4f1a\u540e\u603b\u7ed3\u3002\u6bcf\u4e2a\u4eba\u56de\u5230\u81ea\u5df1\u7684\u8111\u5b50\u91cc&#xff08;\u6216\u8005\u7b14\u8bb0\u672c\u4e0a&#xff09;&#xff0c;\u5355\u72ec\u6574\u7406\u521a\u624d\u542c\u5230\u7684\u5185\u5bb9&#xff0c;\u63d0\u70bc\u51fa\u91cd\u70b9\u3002<\/li>\n<\/ul>\n<p>\u6ce8\u610f&#xff1a;\u867d\u7136\u5927\u5bb6\u662f\u5404\u56de\u5404\u5bb6\u5355\u72ec\u5904\u7406&#xff0c;\u4f46\u5927\u5bb6\u7528\u7684\u201c\u5904\u7406\u903b\u8f91\u201d&#xff08;\u6743\u91cd\u77e9\u9635 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         W<\/p>\n<p>        W<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 
0.1389em\">W<\/span><\/span><\/span><\/span><\/span>&#xff09;\u662f\u4e00\u6a21\u4e00\u6837\u7684\u3002\u8fd9\u5c31\u50cf\u5168\u73ed\u540c\u5b66\u90fd\u7528\u540c\u4e00\u5957\u6a21\u677f\u5199\u603b\u7ed3\u3002<\/p>\n<h4>2. \u4e09\u660e\u6cbb\u7ed3\u6784&#xff1a;\u5148\u5bbd\u540e\u7a84<\/h4>\n<p>FFN \u7684\u5185\u90e8\u7ed3\u6784\u975e\u5e38\u7b80\u5355&#xff0c;\u5c31\u50cf\u4e00\u4e2a\u4e09\u660e\u6cbb&#xff0c;\u6216\u8005\u8bf4\u662f\u4e00\u4e2a\u81a8\u80c0\u2014\u6536\u7f29\u7684\u8fc7\u7a0b\u3002<\/p>\n<p>\u5b83\u7531\u4e24\u4e2a\u7ebf\u6027\u53d8\u6362&#xff08;Linear&#xff09;\u5939\u7740\u4e00\u4e2a\u975e\u7ebf\u6027\u6fc0\u6d3b\u51fd\u6570&#xff08;ReLU&#xff09;\u7ec4\u6210\u3002<\/p>\n<li>\n<p>\u7b2c\u4e00\u5c42 Linear&#xff08;\u81a8\u80c0&#xff09;&#xff1a; \u628a\u8f93\u5165\u7684\u7ef4\u5ea6&#xff08;\u6bd4\u5982 512&#xff09;\u5f3a\u884c\u53d8\u5927&#xff08;\u901a\u5e38\u53d8\u5927 4 \u500d&#xff0c;\u53d8\u6210 2048&#xff09;\u3002<\/p>\n<ul>\n<li>\u4e3a\u4ec0\u4e48\u8981\u53d8\u5927&#xff1f; \u5c31\u50cf\u628a\u4e00\u4e2a\u6298\u53e0\u7684\u7eb8\u56e2\u5c55\u5f00\u3002\u5728\u4f4e\u7ef4\u7a7a\u95f4\u6324\u5728\u4e00\u8d77\u5206\u4e0d\u6e05\u7684\u7279\u5f81&#xff0c;\u6295\u5c04\u5230\u9ad8\u7ef4\u7a7a\u95f4\u540e&#xff0c;\u66f4\u5bb9\u6613\u628a\u5b83\u4eec\u5206\u79bb\u5f00\u6765\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>\u6fc0\u6d3b\u51fd\u6570&#xff08;ReLU&#xff09;&#xff1a; \u8fd9\u662f\u6ce8\u5165\u201c\u7075\u9b42\u201d\u7684\u4e00\u6b65\u3002\u5982\u679c\u6ca1\u6709\u5b83&#xff0c;\u4e24\u5c42 Linear \u53e0\u5728\u4e00\u8d77\u5176\u5b9e\u8fd8\u662f\u7b49\u4e8e\u4e00\u5c42 Linear\u3002ReLU \u8d1f\u8d23\u628a\u90a3\u4e9b\u8d1f\u80fd\u91cf&#xff08;\u8d1f\u6570&#xff09;\u780d\u6389&#xff0c;\u53ea\u4fdd\u7559\u6b63\u80fd\u91cf\u3002<\/p>\n<ul>\n<li>\u516c\u5f0f&#xff1a;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>           R<\/p>\n<p>           e<\/p>\n<p>           L<\/p>\n<p>           U<\/p>\n<p>           (<\/p>\n<p>           x<\/p>\n<p>    
       )<\/p>\n<p>           &#061;<\/p>\n<p>           max<\/p>\n<p>           \u2061<\/p>\n<p>           (<\/p>\n<p>           0<\/p>\n<p>           ,<\/p>\n<p>           x<\/p>\n<p>           )<\/p>\n<p>          ReLU(x) &#061; \\\\max(0, x)<\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0077em\">R<\/span><span class=\"mord mathnormal\">e<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.109em\">LU<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mclose\">)<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mop\">max<\/span><span class=\"mopen\">(<\/span><span class=\"mord\">0<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/li>\n<\/ul>\n<\/li>\n<li>\n<p>\u7b2c\u4e8c\u5c42 Linear&#xff08;\u6536\u7f29&#xff09;&#xff1a; \u628a 2048 \u7ef4\u7684\u5411\u91cf\u518d\u538b\u7f29\u56de 512 \u7ef4\u3002<\/p>\n<ul>\n<li>\u4e3a\u4ec0\u4e48\u8981\u53d8\u56de\u6765&#xff1f; \u4e3a\u4e86\u548c\u4e0b\u4e00\u5c42\u7f51\u7edc\u5bf9\u63a5&#xff08;\u8fd8\u8bb0\u5f97\u6b8b\u5dee\u8fde\u63a5\u5417&#xff1f;\u7ef4\u5ea6\u5fc5\u987b\u4e00\u81f4\u624d\u80fd\u76f8\u52a0&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u505a\u9762\u56e2<\/h5>\n<ul>\n<li>\u8f93\u5165&#xff1a;\u4e00\u56e2\u521a\u63c9\u597d\u7684\u9762&#xff08;\u5305\u542b\u4e86 Attention 
\u6536\u96c6\u7684\u4fe1\u606f&#xff09;\u3002<\/li>\n<li>\u7b2c\u4e00\u6b65&#xff08;\u53d8\u5bbd&#xff09;&#xff1a;\u628a\u9762\u56e2\u7528\u529b\u64c0\u5f00&#xff0c;\u94fa\u6ee1\u6574\u4e2a\u684c\u5b50\u3002\u8fd9\u6837\u4f60\u80fd\u770b\u6e05\u9762\u7c89\u91cc\u7684\u6bcf\u4e00\u4e2a\u7ec6\u8282\u3002<\/li>\n<li>\u7b2c\u4e8c\u6b65&#xff08;ReLU&#xff09;&#xff1a;\u5728\u9762\u997c\u4e0a\u6492\u70b9\u6599&#xff0c;\u6216\u8005\u5207\u6389\u8fb9\u7f18\u4e0d\u6574\u9f50\u7684\u90e8\u5206\u3002<\/li>\n<li>\u7b2c\u4e09\u6b65&#xff08;\u53d8\u7a84&#xff09;&#xff1a;\u628a\u9762\u997c\u91cd\u65b0\u63c9\u56de\u4e00\u4e2a\u5c0f\u9762\u56e2\u3002<\/li>\n<\/ul>\n<p>\u6b64\u65f6&#xff0c;\u8fd9\u4e2a\u9762\u56e2\u867d\u7136\u5916\u89c2\u6ca1\u53d8&#xff08;\u8fd8\u662f 512 \u7ef4&#xff09;&#xff0c;\u4f46\u5185\u90e8\u7684\u7b4b\u9053&#xff08;\u7279\u5f81&#xff09;\u5df2\u7ecf\u5b8c\u5168\u4e0d\u540c\u4e86\u3002<\/p>\n<h4>3. \u6570\u5b66\u516c\u5f0f&#xff08;\u770b\u4e00\u773c\u5c31\u884c&#xff09;<\/h4>\n<p><span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         F<\/p>\n<p>         F<\/p>\n<p>         N<\/p>\n<p>         (<\/p>\n<p>         x<\/p>\n<p>         )<\/p>\n<p>         &#061;<\/p>\n<p>         max<\/p>\n<p>         \u2061<\/p>\n<p>         (<\/p>\n<p>         0<\/p>\n<p>         ,<\/p>\n<p>         x<\/p>\n<p>          W<\/p>\n<p>          1<\/p>\n<p>         &#043;<\/p>\n<p>          b<\/p>\n<p>          1<\/p>\n<p>         )<\/p>\n<p>          W<\/p>\n<p>          2<\/p>\n<p>         &#043;<\/p>\n<p>          b<\/p>\n<p>          2<\/p>\n<p>        FFN(x) &#061; \\\\max(0, xW_1 &#043; b_1)W_2 &#043; b_2<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.109em\">FFN<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">x<\/span><span 
class=\"mclose\">)<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mop\">max<\/span><span class=\"mopen\">(<\/span><span class=\"mord\">0<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.1389em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">b<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 
0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.1389em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8444em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">b<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<ul>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         x<\/p>\n<p>        x<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.4306em\"><\/span><span class=\"mord 
mathnormal\">x<\/span><\/span><\/span><\/span><\/span>&#xff1a;\u8f93\u5165&#xff08;512\u7ef4&#xff09;<\/li>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          W<\/p>\n<p>          1<\/p>\n<p>        W_1<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.1389em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff1a;\u628a 512 \u53d8\u6210 2048<\/li>\n<li><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          W<\/p>\n<p>          2<\/p>\n<p>        W_2<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.1389em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" 
style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span>&#xff1a;\u628a 2048 \u53d8\u6210 512<\/li>\n<\/ul>\n<h4>4. \u6d41\u7a0b\u56fe\u89e3&#xff1a;\u6570\u636e\u7684\u201c\u547c\u5438\u201d\u8fd0\u52a8<\/h4>\n<p>\u8ba9\u6211\u4eec\u770b\u770b\u6570\u636e\u5728\u8fd9\u4e2a\u73af\u8282\u662f\u5982\u4f55\u201c\u5438\u6c14\u81a8\u80c0\u201d\u518d\u201c\u547c\u6c14\u6536\u7f29\u201d\u7684\u3002<\/p>\n<p>  #mermaid-svg-OncFgLMfEeOkAQz1{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-OncFgLMfEeOkAQz1 .error-icon{fill:#552222;}#mermaid-svg-OncFgLMfEeOkAQz1 .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-OncFgLMfEeOkAQz1 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-OncFgLMfEeOkAQz1 .marker{fill:#333333;stroke:#333333;}#mermaid-svg-OncFgLMfEeOkAQz1 .marker.cross{stroke:#333333;}#mermaid-svg-OncFgLMfEeOkAQz1 svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-OncFgLMfEeOkAQz1 p{margin:0;}#mermaid-svg-OncFgLMfEeOkAQz1 .label{font-family:\\&#8221;trebuchet 
ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 .cluster-label text{fill:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 .cluster-label span{color:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 .cluster-label span p{background-color:transparent;}#mermaid-svg-OncFgLMfEeOkAQz1 .label text,#mermaid-svg-OncFgLMfEeOkAQz1 span{fill:#333;color:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 .node rect,#mermaid-svg-OncFgLMfEeOkAQz1 .node circle,#mermaid-svg-OncFgLMfEeOkAQz1 .node ellipse,#mermaid-svg-OncFgLMfEeOkAQz1 .node polygon,#mermaid-svg-OncFgLMfEeOkAQz1 .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-OncFgLMfEeOkAQz1 .rough-node .label text,#mermaid-svg-OncFgLMfEeOkAQz1 .node .label text,#mermaid-svg-OncFgLMfEeOkAQz1 .image-shape .label,#mermaid-svg-OncFgLMfEeOkAQz1 .icon-shape .label{text-anchor:middle;}#mermaid-svg-OncFgLMfEeOkAQz1 .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-OncFgLMfEeOkAQz1 .rough-node .label,#mermaid-svg-OncFgLMfEeOkAQz1 .node .label,#mermaid-svg-OncFgLMfEeOkAQz1 .image-shape .label,#mermaid-svg-OncFgLMfEeOkAQz1 .icon-shape .label{text-align:center;}#mermaid-svg-OncFgLMfEeOkAQz1 .node.clickable{cursor:pointer;}#mermaid-svg-OncFgLMfEeOkAQz1 .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-OncFgLMfEeOkAQz1 .arrowheadPath{fill:#333333;}#mermaid-svg-OncFgLMfEeOkAQz1 .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-OncFgLMfEeOkAQz1 .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-OncFgLMfEeOkAQz1 .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-OncFgLMfEeOkAQz1 .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-OncFgLMfEeOkAQz1 .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-OncFgLMfEeOkAQz1 .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-OncFgLMfEeOkAQz1 .cluster 
rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-OncFgLMfEeOkAQz1 .cluster text{fill:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 .cluster span{color:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-OncFgLMfEeOkAQz1 .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-OncFgLMfEeOkAQz1 rect.text{fill:none;stroke-width:0;}#mermaid-svg-OncFgLMfEeOkAQz1 .icon-shape,#mermaid-svg-OncFgLMfEeOkAQz1 .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-OncFgLMfEeOkAQz1 .icon-shape p,#mermaid-svg-OncFgLMfEeOkAQz1 .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-OncFgLMfEeOkAQz1 .icon-shape rect,#mermaid-svg-OncFgLMfEeOkAQz1 .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-OncFgLMfEeOkAQz1 .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-OncFgLMfEeOkAQz1 .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-OncFgLMfEeOkAQz1 :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>FFN \u5185\u90e8\u8fd0\u4f5c\u673a\u7406<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u7279\u5f81\u5c55\u5f00<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u7279\u5f81\u63d0\u70bc\u5b8c\u6210<\/p>\n<p><\/span><\/p>\n<p>     
      <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u5165\u5411\u91cf x  (\u7ef4\u5ea6: 512)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Linear Layer 1  (\u5347\u7ef4: 512 -&gt; 2048)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u4e2d\u95f4\u5c42\u5411\u91cf  (\u7ef4\u5ea6: 2048)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>ReLU \u6fc0\u6d3b\u51fd\u6570  (\u8fc7\u6ee4\/\u975e\u7ebf\u6027\u53d8\u6362)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6fc0\u6d3b\u540e\u7684\u5411\u91cf  (\u7ef4\u5ea6: 2048)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Linear Layer 2  (\u964d\u7ef4: 2048 -&gt; 512)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa\u5411\u91cf  (\u7ef4\u5ea6: 512)<\/p>\n<p><\/span><\/p>\n<h4>5. \u4e00\u4e2a\u6709\u8da3\u7684\u53d1\u73b0<\/h4>\n<p>\u4f60\u53ef\u80fd\u542c\u8bf4\u8fc7 GPT \u7684\u53c2\u6570\u91cf\u5f88\u5927&#xff08;\u6bd4\u5982 1750 \u4ebf\u53c2\u6570&#xff09;\u3002\u5176\u5b9e&#xff0c;\u8fd9\u5176\u4e2d\u6709 2\/3 \u7684\u53c2\u6570\u90fd\u96c6\u4e2d\u5728 FFN \u91cc&#xff01;<\/p>\n<ul>\n<li>Attention \u7684\u53c2\u6570\u53ea\u5360\u5269\u4e0b\u7684\u7ea6 1\/3&#xff08;<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          W<\/p>\n<p>          Q<\/p>\n<p>         ,<\/p>\n<p>          W<\/p>\n<p>          K<\/p>\n<p>         ,<\/p>\n<p>          W<\/p>\n<p>          V<\/p>\n<p>        W^Q, W^K, W^V<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.0358em;vertical-align: -0.1944em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span 
class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">Q<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.1389em\">W<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u90fd\u4e0d\u5927&#xff09;\u3002<\/li>\n<li>FFN \u7684\u90a3\u4e2a\u201c\u4e2d\u95f4\u5c42\u201d&#xff08;2048 \u751a\u81f3\u66f4\u5927&#xff09;\u6d88\u8017\u4e86\u5927\u91cf\u7684\u77e9\u9635\u7a7a\u95f4\u3002<\/li>\n<\/ul>\n<p>\u6709\u7814\u7a76\u8ba4\u4e3a&#xff0c;Attention \u8d1f\u8d23\u903b\u8f91\u548c\u5173\u7cfb&#xff0c;\u800c FFN 
\u771f\u7684\u5c31\u50cf\u662f\u201c\u5927\u8111\u7684\u8bb0\u5fc6\u533a\u201d&#xff0c;\u91cc\u9762\u5b58\u50a8\u4e86\u5927\u91cf\u7684\u5177\u4f53\u77e5\u8bc6&#xff08;\u6bd4\u5982\u201c\u6cd5\u56fd\u7684\u9996\u90fd\u662f\u5df4\u9ece\u201d\u8fd9\u79cd\u4e8b\u5b9e&#xff0c;\u53ef\u80fd\u5c31\u5b58\u5728 FFN \u7684\u6743\u91cd\u91cc&#xff09;\u3002<\/p>\n<hr \/>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<li>FFN \u7d27\u8ddf\u5728 Attention \u540e\u9762&#xff0c;\u8d1f\u8d23\u5bf9\u6bcf\u4e2a\u4f4d\u7f6e\u7684\u4fe1\u606f\u8fdb\u884c\u72ec\u7acb\u52a0\u5de5\u3002<\/li>\n<li>\u5b83\u662f\u4e00\u4e2a \u7a84-\u5bbd-\u7a84&#xff08;\u5148\u81a8\u80c0\u518d\u6536\u7f29&#xff1a;512 -&gt; 2048 -&gt; 512&#xff09;\u7684\u7ed3\u6784\u3002<\/li>\n<li>\u5b83\u7684\u4f5c\u7528\u662f\u63d0\u70bc\u7279\u5f81&#xff0c;\u628a Attention \u6536\u96c6\u5230\u7684\u6742\u4e71\u4fe1\u606f\u6574\u5408\u6210\u6e05\u6670\u7684\u8bed\u4e49\u3002<\/li>\n<hr \/>\n<p>\u81f3\u6b64&#xff0c;Encoder&#xff08;\u7f16\u7801\u5668&#xff09;\u7684\u6240\u6709\u79d8\u5bc6\u90fd\u88ab\u6211\u4eec\u89e3\u5f00\u4e86&#xff01;<\/p>\n<p>Input -&gt; Embedding -&gt; Pos -&gt; [ Attention -&gt; Add&amp;Norm -&gt; FFN -&gt; Add&amp;Norm ] x N -&gt; Output<\/p>\n<p>\u73b0\u5728&#xff0c;\u6211\u4eec\u8981\u8de8\u8fc7\u90a3\u9053\u754c\u7ebf&#xff0c;\u8fdb\u5165\u53f3\u8fb9\u7684 Decoder&#xff08;\u89e3\u7801\u5668&#xff09;\u3002 \u867d\u7136 Decoder \u770b\u8d77\u6765\u548c Encoder \u5f88\u50cf&#xff0c;\u4f46\u5b83\u6709\u4e00\u4e2a\u975e\u5e38\u201c\u5fc3\u673a\u201d\u7684\u6539\u52a8\u2014\u2014\u5b83\u5728\u8003\u8bd5\u65f6\u4e0d\u80fd\u5077\u770b\u7b54\u6848\u3002<\/p>\n<hr \/>\n<p>\u5728 Encoder&#xff08;\u5de6\u8fb9\u7684\u5854&#xff09;\u91cc&#xff0c;\u6211\u4eec\u8981\u7406\u89e3\u4e00\u53e5\u8bdd&#xff0c;\u5f53\u7136\u662f\u4e00\u773c\u770b\u5b8c\u5168\u6587\u6700\u597d&#xff08;\u4e0a\u5e1d\u89c6\u89d2&#xff09;\u3002 \u4f46\u5728 Decoder&#xff08;\u53f3\u8fb9\u7684\u5854&#xff09;\u91cc&#xff0c;\u4efb\u52a1\u53d8\u4e86\u3002Decoder 
\u7684\u4efb\u52a1\u662f\u751f\u6210\u3002<\/p>\n<p>\u800c\u5728\u751f\u6210\u7684\u65f6\u5019&#xff0c;\u6700\u5fcc\u8bb3\u7684\u5c31\u662f\u2014\u2014\u5267\u900f\u3002<\/p>\n<hr \/>\n<h2>\u7b2c\u516b\u7ae0&#xff1a;\u89e3\u7801\u5668\u7684\u72ec\u89d2\u620f\u2014\u2014Masked Multi-Head Attention<\/h2>\n<h4>1. \u4e3a\u4ec0\u4e48 Decoder \u4e0d\u80fd\u6709\u201c\u4e0a\u5e1d\u89c6\u89d2\u201d&#xff1f;<\/h4>\n<p>\u60f3\u8c61\u4f60\u5728\u53c2\u52a0\u82f1\u8bed\u8003\u8bd5&#xff0c;\u9898\u76ee\u662f\u4e2d\u8bd1\u82f1&#xff1a;<\/p>\n<p>\u4e2d\u6587&#xff1a;\u6211\u7231\u5b66\u4e60 \u7b54\u6848&#xff1a;I love study<\/p>\n<p>\u5728\u8bad\u7ec3\u7684\u65f6\u5019&#xff0c;\u6211\u4eec\u662f\u628a\u6b63\u786e\u7b54\u6848 I love study \u5582\u7ed9 Decoder \u7684&#xff08;\u8fd9\u53eb Teacher Forcing&#xff09;\u3002<\/p>\n<p>\u4f46\u662f&#xff0c;\u5982\u679c Decoder \u5728\u9884\u6d4b I \u7684\u65f6\u5019&#xff0c;\u5df2\u7ecf\u5077\u770b\u5230\u4e86\u540e\u9762\u7684 love \u548c study&#xff0c;\u90a3\u5b83\u8fd8\u5b66\u4e2a\u5c41\u554a&#xff1f;\u5b83\u76f4\u63a5\u7167\u6284\u4e0d\u5c31\u884c\u4e86&#xff1f; \u8fd9\u5c31\u597d\u6bd4\u8001\u5e08\u8ba9\u4f60\u586b\u7a7a&#xff0c;\u7ed3\u679c\u628a\u7b54\u6848\u5199\u5728\u4e86\u6a2a\u7ebf\u540e\u9762&#xff0c;\u4f60\u6839\u672c\u4e0d\u7528\u52a8\u8111\u5b50&#xff0c;\u76f4\u63a5\u6284\u5c31\u884c\u3002<\/p>\n<p>\u5230\u4e86\u771f\u6b63\u8003\u8bd5&#xff08;\u63a8\u7406\/\u5e94\u7528&#xff09;\u7684\u65f6\u5019&#xff0c;\u662f\u6ca1\u6709\u4eba\u7ed9\u4f60\u770b\u540e\u9762\u7684\u8bcd\u7684\u3002\u5982\u679c\u8bad\u7ec3\u65f6\u4e60\u60ef\u4e86\u4f5c\u5f0a&#xff0c;\u8003\u8bd5\u65f6\u7edd\u5bf9\u6293\u778e\u3002<\/p>\n<p>\u6240\u4ee5&#xff0c;\u6211\u4eec\u9700\u8981\u7ed9 Decoder \u6234\u4e0a\u4e00\u526f\u7279\u5236\u7684\u773c\u955c&#xff1a; \u5f53\u5b83\u8bfb\u5230\u7b2c <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         t<\/p>\n<p>        t<\/p>\n<p>     <\/span><span class=\"katex-html\"><span 
class=\"base\"><span class=\"strut\" style=\"height: 0.6151em\"><\/span><span class=\"mord mathnormal\">t<\/span><\/span><\/span><\/span><\/span> \u4e2a\u8bcd\u7684\u65f6\u5019&#xff0c;\u5f3a\u884c\u8ba9\u5b83\u770b\u4e0d\u89c1 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         t<\/p>\n<p>        t<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6151em\"><\/span><span class=\"mord mathnormal\">t<\/span><\/span><\/span><\/span><\/span> \u4e4b\u540e\u7684\u8bcd\u3002<\/p>\n<p>\u8fd9\u5c31\u662f Mask (\u63a9\u7801)\u3002<\/p>\n<hr \/>\n<h4>2. Mask \u7684\u9b54\u6cd5&#xff1a;\u4e0b\u4e09\u89d2\u77e9\u9635<\/h4>\n<p>\u5177\u4f53\u600e\u4e48\u505a\u5462&#xff1f;\u6211\u4eec\u56de\u60f3\u4e00\u4e0b Attention \u7684\u8ba1\u7b97\u516c\u5f0f&#xff1a; <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         S<\/p>\n<p>         o<\/p>\n<p>         f<\/p>\n<p>         t<\/p>\n<p>         m<\/p>\n<p>         a<\/p>\n<p>         x<\/p>\n<p>         (<\/p>\n<p>         Q<\/p>\n<p>         \u22c5<\/p>\n<p>          K<\/p>\n<p>          T<\/p>\n<p>         )<\/p>\n<p>        Softmax(Q \\\\cdot K^T)<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0576em\">S<\/span><span class=\"mord mathnormal\">o<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.1076em\">f<\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mord mathnormal\">ma<\/span><span class=\"mord mathnormal\">x<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\">Q<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span 
class=\"base\"><span class=\"strut\" style=\"height: 1.1413em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8913em\"><span class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.1389em\">T<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<p><span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        Q<\/p>\n<p>        \u22c5<\/p>\n<p>         K<\/p>\n<p>         T<\/p>\n<p>       Q \\\\cdot K^T<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u22c5<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8413em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8413em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.1389em\">T<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u7b97\u51fa\u6765\u7684\u662f\u4e00\u4e2a <span class=\"katex--inline\"><span class=\"katex\"><span 
class=\"katex-mathml\"><\/p>\n<p>        N<\/p>\n<p>        \u00d7<\/p>\n<p>        N<\/p>\n<p>       N \\\\times N<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7667em;vertical-align: -0.0833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.109em\">N<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.109em\">N<\/span><\/span><\/span><\/span><\/span> \u7684\u65b9\u9635&#xff0c;\u4ee3\u8868\u6bcf\u4e2a\u8bcd\u5bf9\u5176\u4ed6\u8bcd\u7684\u5173\u6ce8\u5ea6\u3002<\/p>\n<p>\u4e3a\u4e86\u5b9e\u65bd\u201c\u9632\u4f5c\u5f0a\u201d\u7b56\u7565&#xff0c;\u6211\u4eec\u5728\u628a\u8fd9\u4e2a\u77e9\u9635\u6254\u8fdb Softmax \u4e4b\u524d&#xff0c;\u8981\u505a\u4e2a<strong>\u201c\u5927\u624b\u672f\u201d<\/strong>&#xff1a;<\/p>\n<p>\u6211\u4eec\u628a\u77e9\u9635\u7684\u53f3\u4e0a\u89d2\u5168\u90e8\u6d82\u9ed1&#xff08;\u6216\u8005\u662f\u8d34\u4e0a\u80f6\u5e26&#xff09;\u3002 \u5728\u6570\u5b66\u4e0a&#xff0c;\u5c31\u662f\u628a\u53f3\u4e0a\u89d2\u7684\u4f4d\u7f6e\u5168\u90e8\u8d4b\u503c\u4e3a \u8d1f\u65e0\u7a77\u5927 (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         \u2212<\/p>\n<p>         \u221e<\/p>\n<p>        -\\\\infty<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6667em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">\u2212<\/span><span class=\"mord\">\u221e<\/span><\/span><\/span><\/span><\/span>)\u3002<\/p>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u9636\u68af\u6559\u5ba4\u7684\u6321\u677f<\/h5>\n<p>\u60f3\u8c61 Decoder \u662f\u4e00\u4e2a\u9636\u68af\u6559\u5ba4&#xff1a;<\/p>\n<ul>\n<li>\u5750\u5728\u7b2c 1 
\u6392\u7684\u4eba&#xff08;\u7b2c 1 \u4e2a\u8bcd&#xff09;&#xff0c;\u53ea\u80fd\u770b\u89c1\u4ed6\u81ea\u5df1\u3002<\/li>\n<li>\u5750\u5728\u7b2c 2 \u6392\u7684\u4eba&#xff08;\u7b2c 2 \u4e2a\u8bcd&#xff09;&#xff0c;\u80fd\u770b\u89c1\u7b2c 1 \u6392\u548c\u81ea\u5df1\u3002<\/li>\n<li>\u5750\u5728\u7b2c 3 \u6392\u7684\u4eba&#xff0c;\u80fd\u770b\u89c1 1\u30012 \u548c\u81ea\u5df1\u3002<\/li>\n<li>\u2026<\/li>\n<li>\u4efb\u4f55\u4eba\u90fd\u7edd\u5bf9\u770b\u4e0d\u89c1\u5750\u5728\u4ed6\u540e\u9762\u7684\u4eba&#xff01;<\/li>\n<\/ul>\n<p>\u8fd9\u6837&#xff0c;I \u53ea\u80fd\u5173\u6ce8 I&#xff1b;love \u53ea\u80fd\u5173\u6ce8 I \u548c love\u3002\u672a\u6765\u7684\u8bcd study \u5bf9\u5b83\u4eec\u6765\u8bf4\u5c31\u662f\u4e00\u7247\u9ed1\u6697\u3002<\/p>\n<h5>\u274c \u4e3a\u4ec0\u4e48\u662f\u8d1f\u65e0\u7a77\u5927&#xff1f;<\/h5>\n<p>\u56e0\u4e3a Softmax \u51fd\u6570\u4f1a\u5c06\u8f93\u5165\u8f6c\u5316\u4e3a\u6982\u7387\u3002 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         e<\/p>\n<p>          \u2212<\/p>\n<p>          \u221e<\/p>\n<p>        \u2248<\/p>\n<p>        0<\/p>\n<p>       e^{-\\\\infty} \\\\approx 0<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7713em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">e<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.7713em\"><span class=\"\" style=\"top: -3.063em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">\u2212<\/span><span class=\"mord mtight\">\u221e<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2248<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span 
class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">0<\/span><\/span><\/span><\/span><\/span>\u3002 \u8fd9\u6837\u4e00\u6765&#xff0c;\u6240\u6709\u53f3\u4e0a\u89d2\u7684\u6743\u91cd\u5728\u7ecf\u8fc7 Softmax \u540e\u90fd\u4f1a\u53d8\u6210 0\u3002\u8fd9\u5c31\u5728\u6570\u5b66\u4e0a\u5f7b\u5e95\u5207\u65ad\u4e86\u201c\u5077\u770b\u672a\u6765\u201d\u7684\u53ef\u80fd\u6027\u3002<\/p>\n<hr \/>\n<h4>3. \u56fe\u89e3&#xff1a;\u88ab\u906e\u6321\u7684\u89c6\u7ebf<\/h4>\n<p>\u770b\u8fd9\u4e2a\u56fe\u4e4b\u524d&#xff0c;\u8bf7\u8bb0\u4f4f&#xff1a;\u4e0b\u4e09\u89d2\u662f\u5149\u660e\u7684&#xff0c;\u4e0a\u4e09\u89d2\u662f\u9ed1\u6697\u7684\u3002<\/p>\n<p>  #mermaid-svg-AfOj2YVTeoMhLdG2{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-AfOj2YVTeoMhLdG2 .error-icon{fill:#552222;}#mermaid-svg-AfOj2YVTeoMhLdG2 .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-AfOj2YVTeoMhLdG2 .marker{fill:#333333;stroke:#333333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .marker.cross{stroke:#333333;}#mermaid-svg-AfOj2YVTeoMhLdG2 svg{font-family:\\&#8221;trebuchet 
ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-AfOj2YVTeoMhLdG2 p{margin:0;}#mermaid-svg-AfOj2YVTeoMhLdG2 .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .cluster-label text{fill:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .cluster-label span{color:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .cluster-label span p{background-color:transparent;}#mermaid-svg-AfOj2YVTeoMhLdG2 .label text,#mermaid-svg-AfOj2YVTeoMhLdG2 span{fill:#333;color:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .node rect,#mermaid-svg-AfOj2YVTeoMhLdG2 .node circle,#mermaid-svg-AfOj2YVTeoMhLdG2 .node ellipse,#mermaid-svg-AfOj2YVTeoMhLdG2 .node polygon,#mermaid-svg-AfOj2YVTeoMhLdG2 .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .rough-node .label text,#mermaid-svg-AfOj2YVTeoMhLdG2 .node .label text,#mermaid-svg-AfOj2YVTeoMhLdG2 .image-shape .label,#mermaid-svg-AfOj2YVTeoMhLdG2 .icon-shape .label{text-anchor:middle;}#mermaid-svg-AfOj2YVTeoMhLdG2 .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .rough-node .label,#mermaid-svg-AfOj2YVTeoMhLdG2 .node .label,#mermaid-svg-AfOj2YVTeoMhLdG2 .image-shape .label,#mermaid-svg-AfOj2YVTeoMhLdG2 .icon-shape .label{text-align:center;}#mermaid-svg-AfOj2YVTeoMhLdG2 .node.clickable{cursor:pointer;}#mermaid-svg-AfOj2YVTeoMhLdG2 .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .arrowheadPath{fill:#333333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-AfOj2YVTeoMhLdG2 .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-AfOj2YVTeoMhLdG2 .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 
0.8);}#mermaid-svg-AfOj2YVTeoMhLdG2 .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-AfOj2YVTeoMhLdG2 .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .cluster text{fill:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 .cluster span{color:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-AfOj2YVTeoMhLdG2 .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-AfOj2YVTeoMhLdG2 rect.text{fill:none;stroke-width:0;}#mermaid-svg-AfOj2YVTeoMhLdG2 .icon-shape,#mermaid-svg-AfOj2YVTeoMhLdG2 .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-AfOj2YVTeoMhLdG2 .icon-shape p,#mermaid-svg-AfOj2YVTeoMhLdG2 .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-AfOj2YVTeoMhLdG2 .icon-shape rect,#mermaid-svg-AfOj2YVTeoMhLdG2 .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-AfOj2YVTeoMhLdG2 .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-AfOj2YVTeoMhLdG2 .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-AfOj2YVTeoMhLdG2 :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Masked Attention \u77e9\u9635\u53ef\u89c6\u5316<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u221a \u770b\u5f97\u89c1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>X \u88ab\u6321\u4f4f (-inf)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>X \u88ab\u6321\u4f4f 
(-inf)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u221a \u770b\u5f97\u89c1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u221a \u770b\u5f97\u89c1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>X \u88ab\u6321\u4f4f (-inf)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u221a \u770b\u5f97\u89c1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u221a \u770b\u5f97\u89c1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u221a \u770b\u5f97\u89c1<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Word 1 (I)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Col 1 (I)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Col 2 (love)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Col 3 (study)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Word 2 (love)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Word 3 (study)<\/p>\n<p><\/span><\/p>\n<h4>4. 
Decoder \u7684\u7b2c\u4e8c\u5c42 Attention&#xff1a;\u8de8\u754c\u4e4b\u604b (Cross Attention)<\/h4>\n<p>Decoder \u6bd4 Encoder \u591a\u4e86\u4e00\u4e2a\u7ec4\u4ef6\u3002 \u5728\u641e\u5b8c\u4e0a\u9762\u90a3\u4e2a\u201c\u9632\u4f5c\u5f0a Self-Attention\u201d\u4e4b\u540e&#xff0c;Decoder \u8fd8\u9700\u8981\u505a\u4e00\u4ef6\u6700\u91cd\u8981\u7684\u4e8b&#xff1a;\u770b Encoder \u7684\u8138\u8272\u3002<\/p>\n<p>\u8fd9\u4e00\u5c42\u53eb Encoder-Decoder Attention&#xff08;\u6216\u8005\u53eb Cross Attention&#xff09;\u3002<\/p>\n<p>\u8fd9\u91cc\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>        Q<\/p>\n<p>        ,<\/p>\n<p>        K<\/p>\n<p>        ,<\/p>\n<p>        V<\/p>\n<p>       Q, K, V<\/p>\n<p>    <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span> \u6765\u6e90\u53d1\u751f\u4e86\u53d8\u5316&#xff0c;\u8fd9\u662f\u5168\u7bc7\u552f\u4e00\u7684<strong>\u201c\u6df7\u8840\u201d<\/strong>\u73af\u8282&#xff1a;<\/p>\n<ul>\n<li>Query (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          Q<\/p>\n<p>         Q<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span>)&#xff1a;\u6765\u81ea Decoder \u81ea\u5df1&#xff08;\u521a\u7b97\u5b8c\u7684\u90a3\u4e2a\u5e26\u7740 Mask \u7684\u8f93\u51fa&#xff09;\u3002 
<\/p>\n<ul>\n<li>\u6f5c\u53f0\u8bcd&#xff1a;\u201c\u6211\u60f3\u5199\u4e0b\u4e00\u4e2a\u5b57\u4e86&#xff0c;\u8bf7\u95ee\u8001\u5b66\u7a76&#xff0c;\u6839\u636e\u6211\u73b0\u5728\u5199\u7684\u5185\u5bb9&#xff0c;\u539f\u6587\u91cc\u54ea\u91cc\u6700\u91cd\u8981&#xff1f;\u201d<\/li>\n<\/ul>\n<\/li>\n<li>Key (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>          K<\/p>\n<p>         K<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span>) &amp; Value (<span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          V<\/p>\n<p>         V<\/p>\n<p>      <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span>)&#xff1a;\u6765\u81ea Encoder \u7684\u8f93\u51fa&#xff08;\u8001\u5b66\u7a76\u753b\u7684\u90a3\u5f20\u601d\u7ef4\u5bfc\u56fe&#xff09;\u3002 <\/p>\n<ul>\n<li>\u6f5c\u53f0\u8bcd&#xff1a;\u201c\u539f\u6587\u7684\u4fe1\u606f\u90fd\u5728\u8fd9\u513f\u4e86&#xff0c;\u4f60\u81ea\u5df1\u67e5\u5427\u3002\u201d<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h5>&#x1f393; \u751f\u52a8\u6bd4\u55bb&#xff1a;\u4f5c\u5bb6\u67e5\u8d44\u6599<\/h5>\n<ol>\n<li>Masked Self-Attention&#xff1a;\u4f5c\u5bb6\u770b\u7740\u81ea\u5df1\u521a\u5199\u5b8c\u7684\u4e0a\u534a\u53e5\u8bdd&#xff0c;\u6574\u7406\u601d\u8def&#xff08;\u9632\u6b62\u524d\u540e\u77db\u76fe&#xff09;\u3002<\/li>\n<li>Cross Attention&#xff1a;\u4f5c\u5bb6\u62ac\u8d77\u5934&#xff0c;\u770b\u4e86\u4e00\u773c\u8001\u5b66\u7a76\u7ed9\u7684\u201c\u539f\u6587\u5927\u7eb2\u201d&#xff0c;\u4ece\u4e2d\u63d0\u53d6\u7075\u611f&#xff0c;\u51b3\u5b9a\u4e0b\u4e00\u4e2a\u8bcd\u5199\u4ec0\u4e48\u3002<\/li>\n<\/ol>\n<hr \/>\n<h4>5. 
Decoder \u5b8c\u6574\u7ed3\u6784\u56fe<\/h4>\n<p>Decoder \u7684\u6bcf\u4e00\u5c42\u5176\u5b9e\u662f 3 \u4e2a\u5b50\u5c42 \u7ec4\u6210\u7684&#xff08;\u6bd4 Encoder \u591a\u4e00\u5c42&#xff09;\u3002<\/p>\n<p>  #mermaid-svg-Q8lmVcdngY3GLS2F{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-Q8lmVcdngY3GLS2F .error-icon{fill:#552222;}#mermaid-svg-Q8lmVcdngY3GLS2F .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-Q8lmVcdngY3GLS2F .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-Q8lmVcdngY3GLS2F .marker{fill:#333333;stroke:#333333;}#mermaid-svg-Q8lmVcdngY3GLS2F .marker.cross{stroke:#333333;}#mermaid-svg-Q8lmVcdngY3GLS2F svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-Q8lmVcdngY3GLS2F p{margin:0;}#mermaid-svg-Q8lmVcdngY3GLS2F .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F .cluster-label text{fill:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F .cluster-label span{color:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F .cluster-label span p{background-color:transparent;}#mermaid-svg-Q8lmVcdngY3GLS2F .label text,#mermaid-svg-Q8lmVcdngY3GLS2F 
span{fill:#333;color:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F .node rect,#mermaid-svg-Q8lmVcdngY3GLS2F .node circle,#mermaid-svg-Q8lmVcdngY3GLS2F .node ellipse,#mermaid-svg-Q8lmVcdngY3GLS2F .node polygon,#mermaid-svg-Q8lmVcdngY3GLS2F .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-Q8lmVcdngY3GLS2F .rough-node .label text,#mermaid-svg-Q8lmVcdngY3GLS2F .node .label text,#mermaid-svg-Q8lmVcdngY3GLS2F .image-shape .label,#mermaid-svg-Q8lmVcdngY3GLS2F .icon-shape .label{text-anchor:middle;}#mermaid-svg-Q8lmVcdngY3GLS2F .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-Q8lmVcdngY3GLS2F .rough-node .label,#mermaid-svg-Q8lmVcdngY3GLS2F .node .label,#mermaid-svg-Q8lmVcdngY3GLS2F .image-shape .label,#mermaid-svg-Q8lmVcdngY3GLS2F .icon-shape .label{text-align:center;}#mermaid-svg-Q8lmVcdngY3GLS2F .node.clickable{cursor:pointer;}#mermaid-svg-Q8lmVcdngY3GLS2F .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-Q8lmVcdngY3GLS2F .arrowheadPath{fill:#333333;}#mermaid-svg-Q8lmVcdngY3GLS2F .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-Q8lmVcdngY3GLS2F .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-Q8lmVcdngY3GLS2F .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-Q8lmVcdngY3GLS2F .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-Q8lmVcdngY3GLS2F .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-Q8lmVcdngY3GLS2F .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-Q8lmVcdngY3GLS2F .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-Q8lmVcdngY3GLS2F .cluster text{fill:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F .cluster span{color:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 
96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-Q8lmVcdngY3GLS2F .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-Q8lmVcdngY3GLS2F rect.text{fill:none;stroke-width:0;}#mermaid-svg-Q8lmVcdngY3GLS2F .icon-shape,#mermaid-svg-Q8lmVcdngY3GLS2F .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-Q8lmVcdngY3GLS2F .icon-shape p,#mermaid-svg-Q8lmVcdngY3GLS2F .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-Q8lmVcdngY3GLS2F .icon-shape rect,#mermaid-svg-Q8lmVcdngY3GLS2F .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-Q8lmVcdngY3GLS2F .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-Q8lmVcdngY3GLS2F .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-Q8lmVcdngY3GLS2F :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Decoder Layer \u5185\u90e8\u6784\u9020<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u63d0\u4f9b Q<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/p>\n<p>\u63d0\u4f9b K, V<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Decoder \u8f93\u5165 (Shifted Output)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\">1. 
Masked Self-Attention  (\u53ea\u80fd\u770b\u524d\u9762&#xff0c;\u4e0d\u80fd\u770b\u540e\u9762)<\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Add &amp; Norm<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\">2. Encoder-Decoder Attention  (Q\u6765\u81ea\u4e0b\u65b9, K\/V\u6765\u81eaEncoder)<\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Encoder \u7684\u8f93\u51fa (K, V)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Add &amp; Norm<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\">3. Feed-Forward Network  (\u6d88\u5316\u601d\u8003)<\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Add &amp; Norm<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u8f93\u51fa\u5230\u4e0b\u4e00\u5c42<\/p>\n<p><\/span><\/p>\n<hr \/>\n<h4>\u672c\u7ae0\u5c0f\u7ed3<\/h4>\n<li>Decoder \u662f\u751f\u6210\u8005&#xff1a;\u5b83\u5fc5\u987b\u6309\u987a\u5e8f\u9884\u6d4b&#xff0c;\u4e0d\u80fd\u5077\u770b\u672a\u6765\u3002<\/li>\n<li>Mask \u673a\u5236&#xff1a;\u901a\u8fc7\u5728 Attention \u5206\u6570\u77e9\u9635\u7684\u4e0a\u4e09\u89d2\u586b\u5145\u8d1f\u65e0\u7a77\u5927&#xff0c;\u5728\u6570\u5b66\u4e0a\u5c4f\u853d\u4e86\u672a\u6765\u7684\u4fe1\u606f\u3002<\/li>\n<li>Cross Attention&#xff1a;\u8fd9\u662f\u8fde\u63a5 Encoder \u548c Decoder \u7684\u6865\u6881\u3002Decoder \u7528\u81ea\u5df1\u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\">\n<p>         Q<\/p>\n<p>        Q<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord mathnormal\">Q<\/span><\/span><\/span><\/span><\/span> \u53bb\u67e5 Encoder \u7684 <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         K<\/p>\n<p>        K<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 
0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><\/span><\/span><\/span><\/span> \u548c <span class=\"katex--inline\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         V<\/p>\n<p>        V<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><\/span><\/span><\/span><\/span>\u3002<\/li>\n<p>\u73b0\u5728&#xff0c;Decoder \u7ecf\u8fc7\u5c42\u5c42\u5806\u53e0&#xff0c;\u7ec8\u4e8e\u8f93\u51fa\u4e86\u4e00\u4e2a\u5411\u91cf\u3002 \u4f46\u8fd9\u4e2a\u5411\u91cf\u53ea\u662f\u4e00\u4e32\u6570\u5b57&#xff0c;\u600e\u4e48\u628a\u5b83\u53d8\u56de\u6211\u4eec\u4eba\u7c7b\u80fd\u770b\u61c2\u7684\u5355\u8bcd&#xff08;\u6bd4\u5982 study&#xff09;\u5462&#xff1f;<\/p>\n<hr \/>\n<h2>\u7b2c\u4e5d\u7ae0&#xff1a;\u7ec8\u7ae0\u2014\u2014\u8f93\u51fa\u4e0e\u672a\u6765<\/h2>\n<h4>1. \u6700\u540e\u4e00\u516c\u91cc&#xff1a;Linear &amp; Softmax<\/h4>\n<p>\u5728 Decoder \u7684\u6700\u9876\u5c42&#xff0c;\u8f93\u51fa\u7684\u4f9d\u7136\u662f\u4e00\u4e2a 512 \u7ef4&#xff08;\u6216 1024 \u7ef4&#xff09;\u7684\u5411\u91cf\u3002 \u4e3a\u4e86\u628a\u5b83\u53d8\u6210\u5b57&#xff0c;\u6211\u4eec\u9700\u8981\u7ecf\u8fc7\u6700\u540e\u4e24\u4e2a\u5173\u5361\u3002<\/p>\n<h5>\u7b2c\u4e00\u5173&#xff1a;Linear Layer&#xff08;\u5de8\u5927\u7684\u5355\u8bcd\u6295\u5f71\u4eea&#xff09;<\/h5>\n<p>\u673a\u5668\u7684\u8bcd\u6c47\u8868&#xff08;Vocabulary&#xff09;\u901a\u5e38\u5f88\u5927&#xff0c;\u6bd4\u5982\u6709 30,000 \u4e2a\u5355\u8bcd\u3002 Linear \u5c42\u5c31\u662f\u4e00\u4e2a\u8d85\u7ea7\u5de8\u5927\u7684\u5168\u8fde\u63a5\u7f51\u7edc\u3002\u5b83\u7684\u4f5c\u7528\u662f\u628a\u90a3\u4e2a 512 \u7ef4 \u7684\u5411\u91cf&#xff0c;\u77ac\u95f4\u6295\u5f71\u653e\u5927\u5230 30,000 \u7ef4\u3002<\/p>\n<ul>\n<li>\u903b\u8f91&#xff1a;\u8fd9\u4e2a 30,000 
\u7ef4\u7684\u5411\u91cf\u91cc&#xff0c;\u6bcf\u4e00\u4e2a\u7ef4\u5ea6\u90fd\u5bf9\u5e94\u5b57\u5178\u91cc\u7684\u4e00\u4e2a\u8bcd\u3002<\/li>\n<li>\u7ed3\u679c&#xff1a;\u6211\u4eec\u4f1a\u5f97\u5230 30,000 \u4e2a\u6570\u503c&#xff08;Logits&#xff09;\u3002\u6570\u503c\u8d8a\u5927&#xff0c;\u4ee3\u8868\u673a\u5668\u89c9\u5f97\u4e0b\u4e2a\u8bcd\u662f\u8fd9\u4e2a\u8bcd\u7684\u53ef\u80fd\u6027\u8d8a\u9ad8\u3002<\/li>\n<\/ul>\n<h5>\u7b2c\u4e8c\u5173&#xff1a;Softmax&#xff08;\u6982\u7387\u88c1\u5224&#xff09;<\/h5>\n<p>\u8fd9 30,000 \u4e2a\u6570\u503c\u6709\u6b63\u6709\u8d1f&#xff0c;\u5927\u5c0f\u4e5f\u6ca1\u6709\u56fa\u5b9a\u8303\u56f4&#xff0c;\u6ca1\u6cd5\u76f4\u63a5\u770b\u3002 Softmax \u628a\u5b83\u4eec\u8f6c\u6362\u6210\u6982\u7387\u3002<\/p>\n<ul>\n<li>\u6240\u6709\u6982\u7387\u52a0\u8d77\u6765\u7b49\u4e8e 1\u3002<\/li>\n<li>\u6bd4\u5982&#xff1a;\n<ul>\n<li>apple: 0.001%<\/li>\n<li>book: 0.002%<\/li>\n<li>study: 95%<\/li>\n<li>\u2026<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>\u6700\u7ec8&#xff0c;\u6211\u4eec\u9009\u6982\u7387\u6700\u5927\u7684\u90a3\u4e2a\u8bcd&#xff08;study&#xff09;&#xff0c;\u4f5c\u4e3a\u5f53\u524d\u7684\u8f93\u51fa\u3002<\/p>\n<hr \/>\n<h4>2. 
\u6d41\u7a0b\u56fe\u89e3&#xff1a;\u4ece\u6570\u5b57\u5230\u5355\u8bcd<\/h4>\n<p>\u8fd9\u662f Transformer \u751f\u4ea7\u7ebf\u4e0a\u7684\u6700\u540e\u4e00\u9053\u5de5\u5e8f&#xff1a;<\/p>\n<p>  #mermaid-svg-a58D8EoJlrftNuXd{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}@keyframes edge-animation-frame{from{stroke-dashoffset:0;}}@keyframes dash{to{stroke-dashoffset:0;}}#mermaid-svg-a58D8EoJlrftNuXd .edge-animation-slow{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 50s linear infinite;stroke-linecap:round;}#mermaid-svg-a58D8EoJlrftNuXd .edge-animation-fast{stroke-dasharray:9,5!important;stroke-dashoffset:900;animation:dash 20s linear infinite;stroke-linecap:round;}#mermaid-svg-a58D8EoJlrftNuXd .error-icon{fill:#552222;}#mermaid-svg-a58D8EoJlrftNuXd .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-a58D8EoJlrftNuXd .edge-thickness-normal{stroke-width:1px;}#mermaid-svg-a58D8EoJlrftNuXd .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-a58D8EoJlrftNuXd .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-a58D8EoJlrftNuXd .edge-thickness-invisible{stroke-width:0;fill:none;}#mermaid-svg-a58D8EoJlrftNuXd .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-a58D8EoJlrftNuXd .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-a58D8EoJlrftNuXd .marker{fill:#333333;stroke:#333333;}#mermaid-svg-a58D8EoJlrftNuXd .marker.cross{stroke:#333333;}#mermaid-svg-a58D8EoJlrftNuXd svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-a58D8EoJlrftNuXd p{margin:0;}#mermaid-svg-a58D8EoJlrftNuXd .label{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;color:#333;}#mermaid-svg-a58D8EoJlrftNuXd .cluster-label text{fill:#333;}#mermaid-svg-a58D8EoJlrftNuXd .cluster-label span{color:#333;}#mermaid-svg-a58D8EoJlrftNuXd .cluster-label span p{background-color:transparent;}#mermaid-svg-a58D8EoJlrftNuXd .label text,#mermaid-svg-a58D8EoJlrftNuXd 
span{fill:#333;color:#333;}#mermaid-svg-a58D8EoJlrftNuXd .node rect,#mermaid-svg-a58D8EoJlrftNuXd .node circle,#mermaid-svg-a58D8EoJlrftNuXd .node ellipse,#mermaid-svg-a58D8EoJlrftNuXd .node polygon,#mermaid-svg-a58D8EoJlrftNuXd .node path{fill:#ECECFF;stroke:#9370DB;stroke-width:1px;}#mermaid-svg-a58D8EoJlrftNuXd .rough-node .label text,#mermaid-svg-a58D8EoJlrftNuXd .node .label text,#mermaid-svg-a58D8EoJlrftNuXd .image-shape .label,#mermaid-svg-a58D8EoJlrftNuXd .icon-shape .label{text-anchor:middle;}#mermaid-svg-a58D8EoJlrftNuXd .node .katex path{fill:#000;stroke:#000;stroke-width:1px;}#mermaid-svg-a58D8EoJlrftNuXd .rough-node .label,#mermaid-svg-a58D8EoJlrftNuXd .node .label,#mermaid-svg-a58D8EoJlrftNuXd .image-shape .label,#mermaid-svg-a58D8EoJlrftNuXd .icon-shape .label{text-align:center;}#mermaid-svg-a58D8EoJlrftNuXd .node.clickable{cursor:pointer;}#mermaid-svg-a58D8EoJlrftNuXd .root .anchor path{fill:#333333!important;stroke-width:0;stroke:#333333;}#mermaid-svg-a58D8EoJlrftNuXd .arrowheadPath{fill:#333333;}#mermaid-svg-a58D8EoJlrftNuXd .edgePath .path{stroke:#333333;stroke-width:2.0px;}#mermaid-svg-a58D8EoJlrftNuXd .flowchart-link{stroke:#333333;fill:none;}#mermaid-svg-a58D8EoJlrftNuXd .edgeLabel{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-a58D8EoJlrftNuXd .edgeLabel p{background-color:rgba(232,232,232, 0.8);}#mermaid-svg-a58D8EoJlrftNuXd .edgeLabel rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-a58D8EoJlrftNuXd .labelBkg{background-color:rgba(232, 232, 232, 0.5);}#mermaid-svg-a58D8EoJlrftNuXd .cluster rect{fill:#ffffde;stroke:#aaaa33;stroke-width:1px;}#mermaid-svg-a58D8EoJlrftNuXd .cluster text{fill:#333;}#mermaid-svg-a58D8EoJlrftNuXd .cluster span{color:#333;}#mermaid-svg-a58D8EoJlrftNuXd div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:12px;background:hsl(80, 100%, 
96.2745098039%);border:1px solid #aaaa33;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-svg-a58D8EoJlrftNuXd .flowchartTitleText{text-anchor:middle;font-size:18px;fill:#333;}#mermaid-svg-a58D8EoJlrftNuXd rect.text{fill:none;stroke-width:0;}#mermaid-svg-a58D8EoJlrftNuXd .icon-shape,#mermaid-svg-a58D8EoJlrftNuXd .image-shape{background-color:rgba(232,232,232, 0.8);text-align:center;}#mermaid-svg-a58D8EoJlrftNuXd .icon-shape p,#mermaid-svg-a58D8EoJlrftNuXd .image-shape p{background-color:rgba(232,232,232, 0.8);padding:2px;}#mermaid-svg-a58D8EoJlrftNuXd .icon-shape rect,#mermaid-svg-a58D8EoJlrftNuXd .image-shape rect{opacity:0.5;background-color:rgba(232,232,232, 0.8);fill:rgba(232,232,232, 0.8);}#mermaid-svg-a58D8EoJlrftNuXd .label-icon{display:inline-block;height:1em;overflow:visible;vertical-align:-0.125em;}#mermaid-svg-a58D8EoJlrftNuXd .node .label-icon path{fill:currentColor;stroke:revert;stroke-width:revert;}#mermaid-svg-a58D8EoJlrftNuXd :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Output Generation<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"edgeLabel\"><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Decoder \u6700\u7ec8\u8f93\u51fa\u5411\u91cf  (512\u7ef4)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Linear Layer  (\u5168\u8fde\u63a5\u5c42: 512 -&gt; 30,000)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Logits  (30,000 \u4e2a\u7ef4\u5ea6\u7684\u6253\u5206)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Softmax  
(\u5f52\u4e00\u5316\u4e3a\u6982\u7387)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6982\u7387\u5206\u5e03  (Study: 0.9, Love: 0.05, &#8230;)<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>Argmax  \u9009\u6700\u5927\u7684<\/p>\n<p><\/span><\/p>\n<p>           <span class=\"nodeLabel\"><\/p>\n<p>\u6700\u7ec8\u5355\u8bcd: &#039;study&#039;<\/p>\n<p><\/span><\/p>\n<hr \/>\n<h4>3. \u540e\u65e5\u8c08&#xff1a;Transformer \u7684\u5bb6\u65cf\u5206\u5bb6<\/h4>\n<p>Transformer \u8bba\u6587\u53d1\u8868\u540e&#xff0c;AI \u754c\u53d1\u751f\u4e86\u5929\u7ffb\u5730\u8986\u7684\u53d8\u5316\u3002\u4f46\u6709\u8da3\u7684\u662f&#xff0c;\u540e\u6765\u7684\u5927\u4f6c\u4eec\u5f88\u5c11\u76f4\u63a5\u7528\u539f\u7248\u7684 Transformer&#xff08;Encoder-Decoder \u67b6\u6784&#xff09;&#xff0c;\u800c\u662f\u628a\u8fd9\u4e2a\u67b6\u6784\u62c6\u5f00\u6765\u7528\u4e86\u3002<\/p>\n<p>\u8fd9\u5c31\u5f62\u6210\u4e86\u73b0\u5728\u5927\u8bed\u8a00\u6a21\u578b&#xff08;LLM&#xff09;\u754c\u7684\u4e24\u5927\u95e8\u6d3e&#xff1a;<\/p>\n<h5>&#x1f535; \u7f16\u7801\u5668\u6d3e (Encoder-only)&#xff1a;BERT \u5bb6\u65cf<\/h5>\n<ul>\n<li>\u4ee3\u8868\u4eba\u7269&#xff1a;BERT, RoBERTa<\/li>\n<li>\u505a\u6cd5&#xff1a;\u53ea\u7528\u4e86 Transformer \u7684 \u5de6\u534a\u8fb9 (Encoder)\u3002<\/li>\n<li>\u7279\u957f&#xff1a;\u201c\u61c2\u4f60\u201d\u3002\u56e0\u4e3a Encoder \u53ef\u4ee5\u540c\u65f6\u770b\u5230\u4e0a\u4e0b\u6587&#xff0c;\u5b83\u662f\u641e\u9605\u8bfb\u7406\u89e3\u3001\u60c5\u611f\u5206\u6790\u3001\u6587\u672c\u5206\u7c7b\u7684\u795e\u3002\u5b83\u4e0d\u64c5\u957f\u8bf4\u8bdd&#xff0c;\u4f46\u64c5\u957f\u542c\u61c2\u4eba\u8bdd\u3002<\/li>\n<\/ul>\n<h5>&#x1f7e2; \u89e3\u7801\u5668\u6d3e (Decoder-only)&#xff1a;GPT \u5bb6\u65cf<\/h5>\n<ul>\n<li>\u4ee3\u8868\u4eba\u7269&#xff1a;GPT-3, ChatGPT, Llama, Claude<\/li>\n<li>\u505a\u6cd5&#xff1a;\u53ea\u7528\u4e86 Transformer \u7684 \u53f3\u534a\u8fb9 
(Decoder)&#xff08;\u5f53\u7136&#xff0c;\u53bb\u6389\u4e86\u4e2d\u95f4\u90a3\u4e2a Cross Attention&#xff0c;\u53d8\u6210\u4e86\u7eaf\u7cb9\u7684 Autoregressive&#xff09;\u3002<\/li>\n<li>\u7279\u957f&#xff1a;\u201c\u5ffd\u60a0\u201d&#xff08;\u54e6\u4e0d&#xff0c;\u662f\u751f\u6210&#xff09;\u3002\u5b83\u5c31\u50cf\u4e00\u4e2a\u8bdd\u75e8&#xff0c;\u5b83\u770b\u4e0d\u5230\u672a\u6765&#xff0c;\u53ea\u80fd\u6839\u636e\u4e0a\u6587\u4e0d\u505c\u5730\u8e66\u51fa\u4e0b\u4e00\u4e2a\u5b57\u3002\u867d\u7136\u5b83\u4e00\u5f00\u59cb\u4e0d\u5982 BERT \u61c2\u8bed\u6cd5&#xff0c;\u4f46\u5927\u5bb6\u540e\u6765\u53d1\u73b0&#xff0c;\u53ea\u8981\u5927\u529b\u51fa\u5947\u8ff9&#xff08;\u5806\u6570\u636e\u3001\u5806\u7b97\u529b&#xff09;&#xff0c;Decoder \u5c45\u7136\u4e5f\u80fd\u6d8c\u73b0\u51fa\u60ca\u4eba\u7684\u7406\u89e3\u80fd\u529b&#xff01;<\/li>\n<\/ul>\n<p>\u51b7\u77e5\u8bc6&#xff1a;\u73b0\u5728\u7684 ChatGPT \u5176\u5b9e\u672c\u8d28\u4e0a\u5c31\u662f\u4e00\u4e2a\u5de8\u5927\u7684 Decoder\u3002\u5b83\u4e00\u76f4\u5728\u505a\u6211\u4eec\u5728\u7b2c\u516b\u7ae0\u8bb2\u7684\u4e8b\u60c5&#xff1a;\u9884\u6d4b\u4e0b\u4e00\u4e2a\u8bcd\u3002<\/p>\n<hr \/>\n<h4>4. 
\u7ed3\u8bed&#xff1a;Attention Is All You Need<\/h4>\n<p>\u56de\u987e\u6211\u4eec\u7684\u65c5\u7a0b&#xff0c;\u4ece\u7b2c\u4e00\u7ae0\u629b\u5f03 RNN&#xff0c;\u5230\u4e2d\u95f4\u7684 Attention \u77e9\u9635\u8fd0\u7b97&#xff0c;\u518d\u5230\u6700\u540e\u7684\u6982\u7387\u8f93\u51fa\u3002<\/p>\n<p>Transformer \u7684\u6210\u529f\u5728\u4e8e\u5b83\u505a\u5bf9\u4e86\u4e09\u4ef6\u4e8b&#xff1a;<\/p>\n<ol>\n<li>\u5e76\u884c\u8ba1\u7b97&#xff1a;\u89e3\u653e\u4e86 GPU \u7684\u7b97\u529b&#xff0c;\u8ba9\u5927\u89c4\u6a21\u8bad\u7ec3\u6210\u4e3a\u53ef\u80fd\u3002<\/li>\n<li>\u6ce8\u610f\u529b\u673a\u5236&#xff1a;\u8ba9\u673a\u5668\u5b66\u4f1a\u4e86\u50cf\u4eba\u4e00\u6837&#xff0c;\u4ece\u7eb7\u7e41\u590d\u6742\u7684\u4fe1\u606f\u4e2d\u6293\u4f4f\u91cd\u70b9\u3002<\/li>\n<li>\u901a\u7528\u6027&#xff1a;\u5b83\u4e0d\u4ec5\u80fd\u505a NLP&#xff0c;\u540e\u6765\u8fd8\u53bb\u641e\u4e86\u8ba1\u7b97\u673a\u89c6\u89c9&#xff08;ViT&#xff09;&#xff0c;\u751a\u81f3\u86cb\u767d\u8d28\u6298\u53e0&#xff08;AlphaFold&#xff09;\u3002<\/li>\n<\/ol>\n<p>\u90a3\u4e2a 2017 \u5e74 Google \u56e2\u961f\u60f3\u51fa\u6765\u7684\u201c\u9ed1\u76d2\u5b50\u201d&#xff0c;\u5982\u4eca\u5df2\u7ecf\u53d8\u6210\u4e86\u4eba\u5de5\u667a\u80fd\u65f6\u4ee3\u7684\u84b8\u6c7d\u673a\u3002<\/p>\n<hr \/>\n<p>\u592a\u68d2\u4e86&#xff01;\u6709\u4e86\u7406\u8bba\u57fa\u7840&#xff0c;\u518d\u770b\u4ee3\u7801\u5c31\u50cf\u770b\u8bf4\u660e\u4e66\u4e00\u6837\u7b80\u5355\u3002<\/p>\n<p>\u4e3a\u4e86\u4fdd\u6301\u98ce\u683c\u4e00\u81f4&#xff0c;\u8fd9\u4efd\u4ee3\u7801\u8bb2\u89e3\u4f9d\u7136\u4e0d\u8d70\u201c\u67af\u71e5\u6559\u79d1\u4e66\u201d\u8def\u7ebf\u3002\u6211\u4f1a\u628a PyTorch \u4ee3\u7801 \u548c\u4e4b\u524d\u7684 \u751f\u52a8\u6bd4\u55bb \u5bf9\u5e94\u8d77\u6765&#xff0c;\u5e76\u4e14\u7279\u522b\u6807\u6ce8\u51fa\u201c\u5f62\u72b6\u53d8\u6362\u201d&#xff08;Tensor Shapes&#xff09;&#xff0c;\u56e0\u4e3a\u505a NLP \u6700\u5934\u75bc\u7684\u5c31\u662f\u4e0d\u77e5\u9053\u77e9\u9635\u53d8\u6210\u5565\u6837\u4e86\u3002<\/p>\n<p>\u8fd9\u662f\u4e00\u4efd\u300aTransformer 
\u6838\u5fc3\u7ec4\u4ef6\u00b7\u5b9e\u6218\u624b\u672f\u5200\u300b\u3002\u4f60\u53ef\u4ee5\u628a\u5b83\u4f5c\u4e3a\u535a\u5ba2\u7684\u9644\u5f55\u6216\u8005\u5355\u72ec\u7684\u6280\u672f\u5b9e\u6218\u7bc7\u3002<\/p>\n<hr \/>\n<h2>\u9644\u5f55&#xff1a;PyTorch \u4ee3\u7801\u9010\u884c\u62c6\u89e3<\/h2>\n<p>\u4e0b\u9762\u6211\u4eec\u7528 PyTorch \u628a Transformer \u6700\u6838\u5fc3\u7684\u96f6\u4ef6\u62c6\u4e0b\u6765\u7ed9\u4f60\u770b\u3002 &#xff08;\u6ce8&#xff1a;\u4e3a\u4e86\u4ee3\u7801\u6e05\u6670&#xff0c;\u7701\u53bb\u4e86\u90e8\u5206 Dropout \u548c\u521d\u59cb\u5316\u7ec6\u8282&#xff0c;\u4e13\u6ce8\u6838\u5fc3\u903b\u8f91\u3002&#xff09;<\/p>\n<h4>0. \u51c6\u5907\u5de5\u4f5c<\/h4>\n<p><span class=\"token keyword\">import<\/span> torch<br \/>\n<span class=\"token keyword\">import<\/span> torch<span class=\"token punctuation\">.<\/span>nn <span class=\"token keyword\">as<\/span> nn<br \/>\n<span class=\"token keyword\">import<\/span> math<br \/>\n<span class=\"token keyword\">import<\/span> torch<span class=\"token punctuation\">.<\/span>nn<span class=\"token punctuation\">.<\/span>functional <span class=\"token keyword\">as<\/span> F<\/p>\n<p><span class=\"token comment\"># \u5047\u8bbe\u6211\u4eec\u6709\u4e00\u4e2a\u7b80\u5355\u7684\u914d\u7f6e<\/span><br \/>\nd_model <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">512<\/span>   <span class=\"token comment\"># \u8bcd\u5411\u91cf\u7ef4\u5ea6 (Embedding Size)<\/span><br \/>\nn_heads <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">8<\/span>     <span class=\"token comment\"># \u591a\u5934\u6ce8\u610f\u529b\u7684\u5934\u6570<\/span><br \/>\nd_ff <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">2048<\/span>     <span class=\"token comment\"># FFN \u4e2d\u95f4\u5c42\u7ef4\u5ea6 (4\u500d d_model)<\/span><\/p>\n<hr \/>\n<h4>1. 
\u5bf9\u5e94\u7b2c\u4e09\u7ae0&#xff1a;Embedding &#043; \u4f4d\u7f6e\u7f16\u7801 (GPS)<\/h4>\n<p>\u8fd9\u662f\u6570\u636e\u7684\u5165\u53e3\u3002\u6700\u5173\u952e\u7684\u662f PositionalEncoding&#xff0c;\u5b83\u662f\u4e0d\u9700\u8981\u8bad\u7ec3\u7684&#xff08;requires_grad&#061;False&#xff09;&#xff0c;\u5b83\u53ea\u662f\u4e00\u4e2a\u56fa\u5b9a\u7684\u6570\u5b66\u6ce2\u7eb9\u3002<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">PositionalEncoding<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">,<\/span> max_len<span class=\"token operator\">&#061;<\/span><span class=\"token number\">5000<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span>PositionalEncoding<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 1. 
\u521b\u5efa\u4e00\u4e2a\u8db3\u591f\u957f\u7684\u77e9\u9635 (max_len x d_model)<\/span><br \/>\n        pe <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>zeros<span class=\"token punctuation\">(<\/span>max_len<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        position <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>arange<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> max_len<span class=\"token punctuation\">,<\/span> dtype<span class=\"token operator\">&#061;<\/span>torch<span class=\"token punctuation\">.<\/span><span class=\"token builtin\">float<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>unsqueeze<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 2. 
\u8ba1\u7b97\u5206\u6bcd div_term (\u90a3\u4e2a 10000^(2i\/d) \u7684\u5012\u6570)<\/span><br \/>\n        <span class=\"token comment\"># \u8fd9\u91cc\u7684\u6570\u5b66\u6280\u5de7\u662f\u4e3a\u4e86\u6570\u503c\u7a33\u5b9a\u6027<\/span><br \/>\n        div_term <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>exp<span class=\"token punctuation\">(<\/span>torch<span class=\"token punctuation\">.<\/span>arange<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span><span class=\"token builtin\">float<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token operator\">*<\/span> <span class=\"token punctuation\">(<\/span><span class=\"token operator\">-<\/span>math<span class=\"token punctuation\">.<\/span>log<span class=\"token punctuation\">(<\/span><span class=\"token number\">10000.0<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token operator\">\/<\/span> d_model<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 3. 
\u586b\u5145 Sin \u548c Cos<\/span><br \/>\n        <span class=\"token comment\"># \u5076\u6570\u4f4d\u7528 Sin&#xff0c;\u5947\u6570\u4f4d\u7528 Cos<\/span><br \/>\n        pe<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>sin<span class=\"token punctuation\">(<\/span>position <span class=\"token operator\">*<\/span> div_term<span class=\"token punctuation\">)<\/span><br \/>\n        pe<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>cos<span class=\"token punctuation\">(<\/span>position <span class=\"token operator\">*<\/span> div_term<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 4. \u589e\u52a0\u4e00\u4e2a\u7ef4\u5ea6&#xff0c;\u53d8\u6210 [1, max_len, d_model]&#xff0c;\u65b9\u4fbf\u540e\u7eed\u76f4\u63a5\u548c batch \u76f8\u52a0<\/span><br \/>\n        pe <span class=\"token operator\">&#061;<\/span> pe<span class=\"token punctuation\">.<\/span>unsqueeze<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 5. 
\u6ce8\u518c\u4e3a buffer (\u610f\u5473\u7740\u5b83\u4e0d\u662f\u53c2\u6570&#xff0c;\u4e0d\u9700\u8981\u68af\u5ea6\u66f4\u65b0&#xff0c;\u4f46\u5728\u4fdd\u5b58\u6a21\u578b\u65f6\u4f1a\u8ddf\u7740\u8d70)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>register_buffer<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#039;pe&#039;<\/span><span class=\"token punctuation\">,<\/span> pe<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># x \u7684\u5f62\u72b6: [Batch, Seq_Len, d_model]<\/span><br \/>\n        <span class=\"token comment\"># \u76f4\u63a5\u628a\u4f4d\u7f6e\u7f16\u7801\u52a0\u5230 x \u4e0a (\u5207\u7247\u5207\u51fa\u5f53\u524d\u5e8f\u5217\u957f\u5ea6)<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> x <span class=\"token operator\">&#043;<\/span> self<span class=\"token punctuation\">.<\/span>pe<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token punctuation\">:<\/span>x<span class=\"token punctuation\">.<\/span>size<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> x<\/p>\n<p>&#x1f468;\u200d&#x1f4bb; \u8bb2\u89e3&#xff1a; \u8fd9\u4e00\u6b65\u5c31\u662f\u628a\u201c\u84dd\u8272\u989c\u6599\u201d&#xff08;\u4f4d\u7f6e\u4fe1\u606f&#xff09;\u5012\u8fdb\u201c\u7ea2\u8272\u989c\u6599\u201d&#xff08;\u8bcd\u5411\u91cf&#xff09;\u91cc\u3002 register_buffer \u662f PyTorch 
\u7684\u4e00\u4e2a\u5c0f\u6280\u5de7&#xff0c;\u544a\u8bc9\u6a21\u578b&#xff1a;\u201c\u8fd9\u662f\u4e2a\u5e38\u91cf&#xff0c;\u522b\u5728\u8fd9\u4e2a\u53d8\u91cf\u4e0a\u6d6a\u8d39\u68af\u5ea6\u8ba1\u7b97\u8d44\u6e90\u3002\u201d<\/p>\n<hr \/>\n<h4>2. \u5bf9\u5e94\u7b2c\u56db\u3001\u4e94\u7ae0&#xff1a;\u591a\u5934\u6ce8\u610f\u529b (Multi-Head Attention)<\/h4>\n<p>\u8fd9\u662f\u5168\u7bc7\u6700\u96be\u5199\u7684\u90e8\u5206\u3002\u6838\u5fc3\u96be\u70b9\u5728\u4e8e\u7ef4\u5ea6\u7684\u53d8\u6362\u3002\u6211\u4eec\u9700\u8981\u628a 512 \u5207\u6210 8 \u4e2a 64&#xff0c;\u7b97\u5b8c\u540e\u518d\u62fc\u56de\u53bb\u3002<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">MultiHeadAttention<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">,<\/span> n_heads<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span>MultiHeadAttention<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>d_model <span class=\"token operator\">&#061;<\/span> d_model<br \/>\n        self<span class=\"token punctuation\">.<\/span>n_heads <span class=\"token operator\">&#061;<\/span> n_heads<br \/>\n        self<span class=\"token punctuation\">.<\/span>head_dim <span class=\"token operator\">&#061;<\/span> d_model <span class=\"token 
operator\">\/\/<\/span> n_heads <span class=\"token comment\"># \u6bcf\u4e2a\u5934\u7684\u7ef4\u5ea6: 512 \/ 8 &#061; 64<\/span><\/p>\n<p>        <span class=\"token keyword\">assert<\/span> self<span class=\"token punctuation\">.<\/span>head_dim <span class=\"token operator\">*<\/span> n_heads <span class=\"token operator\">&#061;&#061;<\/span> d_model<span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;\u7ef4\u5ea6\u5fc5\u987b\u80fd\u88ab\u5934\u6570\u6574\u9664&#xff01;&#034;<\/span><\/p>\n<p>        <span class=\"token comment\"># \u5b9a\u4e49 W_q, W_k, W_v \u77e9\u9635<\/span><br \/>\n        <span class=\"token comment\"># \u8fd9\u91cc\u6709\u4e2a\u9a9a\u64cd\u4f5c&#xff1a;\u4e0e\u5176\u5b9a\u4e49 8 \u4e2a\u5c0f\u77e9\u9635&#xff0c;\u4e0d\u5982\u5b9a\u4e49 1 \u4e2a\u5927\u77e9\u9635&#xff0c;\u540e\u9762\u518d\u5207\u5206<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>w_q <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>w_k <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>w_v <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u5b9a\u4e49\u6700\u540e\u7684\u878d\u5408\u5c42 W_o<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>fc_out <span class=\"token 
operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> query<span class=\"token punctuation\">,<\/span> key<span class=\"token punctuation\">,<\/span> value<span class=\"token punctuation\">,<\/span> mask<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        batch_size <span class=\"token operator\">&#061;<\/span> query<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span><\/p>\n<p>        <span class=\"token comment\"># 1. \u7ebf\u6027\u53d8\u6362 (Linear Projection)<\/span><br \/>\n        <span class=\"token comment\"># Q, K, V \u5f62\u72b6: [Batch, Seq_Len, d_model]<\/span><br \/>\n        Q <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>w_q<span class=\"token punctuation\">(<\/span>query<span class=\"token punctuation\">)<\/span><br \/>\n        K <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>w_k<span class=\"token punctuation\">(<\/span>key<span class=\"token punctuation\">)<\/span><br \/>\n        V <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>w_v<span class=\"token punctuation\">(<\/span>value<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 2. 
\u5207\u5206\u591a\u5934 (Split Heads)<\/span><br \/>\n        <span class=\"token comment\"># \u8fd9\u4e00\u6b65\u628a d_model \u62c6\u6210\u4e86 n_heads * head_dim<\/span><br \/>\n        <span class=\"token comment\"># \u53d8\u6362\u540e\u5f62\u72b6: [Batch, Seq_Len, n_heads, head_dim]<\/span><br \/>\n        Q <span class=\"token operator\">&#061;<\/span> Q<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>n_heads<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><br \/>\n        K <span class=\"token operator\">&#061;<\/span> K<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>n_heads<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><br \/>\n        V <span class=\"token operator\">&#061;<\/span> V<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>n_heads<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 3. 
\u8c03\u6574\u7ef4\u5ea6\u987a\u5e8f (Transpose)<\/span><br \/>\n        <span class=\"token comment\"># \u4e3a\u4e86\u505a\u77e9\u9635\u4e58\u6cd5&#xff0c;\u6211\u4eec\u9700\u8981\u628a seq_len \u548c head_dim \u653e\u5728\u6700\u540e\u4e24\u7ef4<\/span><br \/>\n        <span class=\"token comment\"># \u53d8\u6362\u540e\u5f62\u72b6: [Batch, n_heads, Seq_Len, head_dim]<\/span><br \/>\n        Q <span class=\"token operator\">&#061;<\/span> Q<span class=\"token punctuation\">.<\/span>permute<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">3<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        K <span class=\"token operator\">&#061;<\/span> K<span class=\"token punctuation\">.<\/span>permute<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">3<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        V <span class=\"token operator\">&#061;<\/span> V<span class=\"token punctuation\">.<\/span>permute<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">3<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 4. 
\u8ba1\u7b97\u6ce8\u610f\u529b\u5206\u6570 (Scaled Dot-Product Attention)<\/span><br \/>\n        <span class=\"token comment\"># energy \u5f62\u72b6: [Batch, n_heads, Seq_Len, Seq_Len]<\/span><br \/>\n        <span class=\"token comment\"># \u8fd9\u91cc\u7684 matmul \u662f\u9488\u5bf9\u6700\u540e\u4e24\u4e2a\u7ef4\u5ea6\u7684<\/span><br \/>\n        energy <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>matmul<span class=\"token punctuation\">(<\/span>Q<span class=\"token punctuation\">,<\/span> K<span class=\"token punctuation\">.<\/span>transpose<span class=\"token punctuation\">(<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token operator\">\/<\/span> math<span class=\"token punctuation\">.<\/span>sqrt<span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>head_dim<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u5982\u679c\u6709 Mask (\u6bd4\u5982 Decoder \u4e0d\u80fd\u770b\u540e\u9762&#xff0c;\u6216\u8005 Padding Mask)<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> mask <span class=\"token keyword\">is<\/span> <span class=\"token keyword\">not<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token comment\"># \u628a mask \u4e3a 0 \u7684\u5730\u65b9\u586b\u6210\u8d1f\u65e0\u7a77 (-1e20)<\/span><br \/>\n            energy <span class=\"token operator\">&#061;<\/span> energy<span class=\"token punctuation\">.<\/span>masked_fill<span class=\"token punctuation\">(<\/span>mask <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">0<\/span><span 
class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1e20<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 5. Softmax \u5f52\u4e00\u5316<\/span><br \/>\n        attention <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>softmax<span class=\"token punctuation\">(<\/span>energy<span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 6. \u52a0\u6743\u6c42\u548c<\/span><br \/>\n        <span class=\"token comment\"># x \u5f62\u72b6: [Batch, n_heads, Seq_Len, head_dim]<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>matmul<span class=\"token punctuation\">(<\/span>attention<span class=\"token punctuation\">,<\/span> V<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 7. 
\u62fc\u63a5\u591a\u5934 (Concat)<\/span><br \/>\n        <span class=\"token comment\"># \u5148\u53d8\u56de [Batch, Seq_Len, n_heads, head_dim]<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> x<span class=\"token punctuation\">.<\/span>permute<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">3<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>contiguous<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># \u518d\u538b\u6241\u6210 [Batch, Seq_Len, d_model]<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> x<span class=\"token punctuation\">.<\/span>view<span class=\"token punctuation\">(<\/span>batch_size<span class=\"token punctuation\">,<\/span> <span class=\"token operator\">-<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">.<\/span>d_model<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 8. 
\u6700\u540e\u7684\u7ebf\u6027\u6df7\u5408<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>fc_out<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token keyword\">return<\/span> x<\/p>\n<p>&#x1f468;\u200d&#x1f4bb; \u8bb2\u89e3&#xff1a;<\/p>\n<ul>\n<li>view \u548c permute \u662f\u8fd9\u91cc\u7684\u7075\u9b42\u3002\u8fd9\u5c31\u50cf\u662f\u4f60\u5728\u73a9\u9b54\u65b9&#xff0c;\u4e3a\u4e86\u8ba9\u540c\u989c\u8272\u7684\u9762\u8f6c\u5230\u4e00\u8d77&#xff0c;\u4f60\u9700\u8981\u5148\u628a\u9b54\u65b9\u8f6c\u51e0\u4e2a\u89d2\u5ea6\u3002<\/li>\n<li>contiguous() \u662f\u4e2a PyTorch \u7684\u5185\u5b58\u7ba1\u7406\u7ec6\u8282&#xff0c;\u5982\u679c\u4e0d\u52a0&#xff0c;\u5728 view \u7684\u65f6\u5019\u53ef\u80fd\u4f1a\u62a5\u9519&#xff08;\u56e0\u4e3a\u5185\u5b58\u4e0d\u8fde\u7eed\u4e86&#xff09;\u3002<\/li>\n<li>masked_fill \u5c31\u662f\u90a3\u4e2a\u201c\u9636\u68af\u6559\u5ba4\u6321\u677f\u201d&#xff0c;\u628a\u4e0d\u9700\u8981\u770b\u7684\u5730\u65b9\u5f3a\u884c\u8bbe\u4e3a\u8d1f\u65e0\u7a77\u3002<\/li>\n<\/ul>\n<hr \/>\n<h4>3. 
\u5bf9\u5e94\u7b2c\u4e03\u7ae0&#xff1a;\u524d\u9988\u795e\u7ecf\u7f51\u7edc (FFN)<\/h4>\n<p>\u8fd9\u662f\u90a3\u4e2a\u201c\u5bbd\u8fdb\u7a84\u51fa\u201d\u7684\u4e09\u660e\u6cbb\u7ed3\u6784\u3002\u7b80\u5355\u7c97\u66b4&#xff0c;\u4f46\u53c2\u6570\u91cf\u5de8\u5927\u3002<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">FeedForward<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">,<\/span> d_ff<span class=\"token punctuation\">,<\/span> dropout<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.1<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span>FeedForward<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token comment\"># nn.Sequential \u5c31\u50cf\u662f\u4e00\u4e2a\u628a\u5c42\u4e32\u8d77\u6765\u7684\u5bb9\u5668<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>net <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Sequential<span class=\"token punctuation\">(<\/span><br \/>\n            <span class=\"token comment\"># \u7b2c\u4e00\u5c42&#xff1a;\u81a8\u80c0 (512 -&gt; 2048)<\/span><br \/>\n            nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_model<span class=\"token 
punctuation\">,<\/span> d_ff<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            <span class=\"token comment\"># \u6fc0\u6d3b\u51fd\u6570&#xff1a;ReLU (\u8fc7\u6ee4\u8d1f\u80fd\u91cf)<\/span><br \/>\n            nn<span class=\"token punctuation\">.<\/span>ReLU<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            <span class=\"token comment\"># Dropout&#xff1a;\u9632\u6b62\u8fc7\u62df\u5408<\/span><br \/>\n            nn<span class=\"token punctuation\">.<\/span>Dropout<span class=\"token punctuation\">(<\/span>dropout<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            <span class=\"token comment\"># \u7b2c\u4e8c\u5c42&#xff1a;\u6536\u7f29 (2048 -&gt; 512)<\/span><br \/>\n            nn<span class=\"token punctuation\">.<\/span>Linear<span class=\"token punctuation\">(<\/span>d_ff<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> self<span class=\"token punctuation\">.<\/span>net<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><\/p>\n<hr \/>\n<h4>4. 
\u5bf9\u5e94\u7b2c\u516d\u7ae0&#xff1a;Encoder Layer (\u7ec4\u88c5\u4e50\u9ad8)<\/h4>\n<p>\u628a\u4e0a\u9762\u6240\u6709\u7684\u96f6\u4ef6&#xff0c;\u7528 Add &amp; Norm \u5c01\u88c5\u8d77\u6765&#xff0c;\u5c31\u6210\u4e86 Encoder \u7684\u4e00\u5c42\u3002<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">EncoderLayer<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">,<\/span> n_heads<span class=\"token punctuation\">,<\/span> d_ff<span class=\"token punctuation\">,<\/span> dropout<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.1<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span>EncoderLayer<span class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u4e24\u4e2a\u4e3b\u8981\u7684\u5b50\u5c42<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>self_attn <span class=\"token operator\">&#061;<\/span> MultiHeadAttention<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> n_heads<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>feed_forward <span class=\"token operator\">&#061;<\/span> FeedForward<span class=\"token punctuation\">(<\/span>d_model<span class=\"token 
punctuation\">,<\/span> d_ff<span class=\"token punctuation\">,<\/span> dropout<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u4e24\u4e2a\u5f52\u4e00\u5316\u5c42 (Layer Norm)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>norm1 <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>LayerNorm<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>norm2 <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>LayerNorm<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># Dropout<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>dropout <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Dropout<span class=\"token punctuation\">(<\/span>dropout<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">,<\/span> mask<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># &#8212; \u5b50\u5c42 1: Self-Attention &#8212;<\/span><br \/>\n        <span class=\"token comment\"># 1. \u4fdd\u7559\u6b8b\u5dee (\u539f\u59cb\u8f93\u5165)<\/span><br \/>\n        residual <span class=\"token operator\">&#061;<\/span> x<\/p>\n<p>        <span class=\"token comment\"># 2. 
\u8ba1\u7b97 Attention<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>self_attn<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">,<\/span> mask<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 3. Dropout &#043; Add &#043; Norm<\/span><br \/>\n        <span class=\"token comment\"># \u6ce8\u610f&#xff1a;\u8fd9\u91cc\u5c55\u793a\u7684\u662f Post-Norm (\u8bba\u6587\u539f\u7248)&#xff0c;\u73b0\u5728\u7684 LLM \u591a\u7528 Pre-Norm<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>norm1<span class=\"token punctuation\">(<\/span>residual <span class=\"token operator\">&#043;<\/span> self<span class=\"token punctuation\">.<\/span>dropout<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># &#8212; \u5b50\u5c42 2: Feed Forward &#8212;<\/span><br \/>\n        <span class=\"token comment\"># 1. \u4fdd\u7559\u6b8b\u5dee<\/span><br \/>\n        residual <span class=\"token operator\">&#061;<\/span> x<\/p>\n<p>        <span class=\"token comment\"># 2. \u8ba1\u7b97 FFN<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>feed_forward<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 3. 
Dropout &#043; Add &#043; Norm<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>norm2<span class=\"token punctuation\">(<\/span>residual <span class=\"token operator\">&#043;<\/span> self<span class=\"token punctuation\">.<\/span>dropout<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token keyword\">return<\/span> x<\/p>\n<p>&#x1f468;\u200d&#x1f4bb; \u8bb2\u89e3&#xff1a; \u8fd9\u91cc\u7684\u4ee3\u7801\u7ed3\u6784\u975e\u5e38\u6e05\u6670\u5730\u5c55\u793a\u4e86 \u201cAdd &amp; Norm\u201d&#xff1a; x &#061; norm(residual &#043; sub_layer(x))\u3002 \u8fd9\u5c31\u662f\u8ba9 Transformer \u80fd\u53e0 100 \u5c42\u7684\u79d8\u8bc0\u3002<\/p>\n<hr \/>\n<h4>5. \u62fc\u88c5\u6210 Transformer Encoder<\/h4>\n<p>\u6700\u540e&#xff0c;\u6211\u4eec\u7528 nn.ModuleList \u628a 6 \u4e2a&#xff08;\u6216 N \u4e2a&#xff09;EncoderLayer \u53e0\u5728\u4e00\u8d77\u3002<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">TransformerEncoder<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">,<\/span> n_layers<span class=\"token punctuation\">,<\/span> n_heads<span class=\"token punctuation\">,<\/span> d_ff<span class=\"token punctuation\">,<\/span> vocab_size<span class=\"token punctuation\">,<\/span> max_len<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span>TransformerEncoder<span 
class=\"token punctuation\">,<\/span> self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u8bcd\u5d4c\u5165 &#043; \u4f4d\u7f6e\u7f16\u7801<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>embedding <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>Embedding<span class=\"token punctuation\">(<\/span>vocab_size<span class=\"token punctuation\">,<\/span> d_model<span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>pos_encoding <span class=\"token operator\">&#061;<\/span> PositionalEncoding<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> max_len<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># \u5806\u53e0 N \u5c42 EncoderLayer<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>layers <span class=\"token operator\">&#061;<\/span> nn<span class=\"token punctuation\">.<\/span>ModuleList<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">[<\/span><br \/>\n            EncoderLayer<span class=\"token punctuation\">(<\/span>d_model<span class=\"token punctuation\">,<\/span> n_heads<span class=\"token punctuation\">,<\/span> d_ff<span class=\"token punctuation\">)<\/span><br \/>\n            <span class=\"token keyword\">for<\/span> _ <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span>n_layers<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>        self<span class=\"token punctuation\">.<\/span>dropout <span class=\"token operator\">&#061;<\/span> nn<span class=\"token 
punctuation\">.<\/span>Dropout<span class=\"token punctuation\">(<\/span><span class=\"token number\">0.1<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">,<\/span> mask<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># 1. \u8f93\u5165\u5904\u7406<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>embedding<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>pos_encoding<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>dropout<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token comment\"># 2. 
\u4e00\u5c42\u4e00\u5c42\u5f80\u4e0b\u4f20<\/span><br \/>\n        <span class=\"token keyword\">for<\/span> layer <span class=\"token keyword\">in<\/span> self<span class=\"token punctuation\">.<\/span>layers<span class=\"token punctuation\">:<\/span><br \/>\n            x <span class=\"token operator\">&#061;<\/span> layer<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">,<\/span> mask<span class=\"token punctuation\">)<\/span><\/p>\n<p>        <span class=\"token keyword\">return<\/span> x<\/p>\n<hr \/>\n<h4>\u603b\u7ed3<\/h4>\n<p>\u770b\u5b8c\u4e86\u4ee3\u7801&#xff0c;\u662f\u4e0d\u662f\u89c9\u5f97 Transformer \u4e5f\u6ca1\u90a3\u4e48\u53ef\u6015&#xff1f; \u5b83\u672c\u8d28\u4e0a\u5c31\u662f\u4e00\u5806 Linear \u5c42&#xff08;\u77e9\u9635\u4e58\u6cd5&#xff09;&#xff0c;\u52a0\u4e0a\u5f62\u72b6\u53d8\u6362&#xff08;view\/permute&#xff09;&#xff0c;\u6700\u540e\u5957\u4e0a\u5f52\u4e00\u5316&#xff08;LayerNorm&#xff09;\u3002<\/p>\n<p>\u867d\u7136\u73b0\u5728\u7684 GPT-4 \u7b49\u6a21\u578b\u5728\u6b64\u57fa\u7840\u4e0a\u505a\u4e86\u5f88\u591a\u5fae\u5c0f\u7684\u4fee\u6539&#xff08;\u6bd4\u5982\u7528 RMSNorm \u66ff\u4ee3 LayerNorm&#xff0c;\u7528 SwiGLU \u66ff\u4ee3 ReLU&#xff0c;\u7528 RoPE \u66ff\u4ee3 Sin\/Cos&#xff09;&#xff0c;\u4f46\u9aa8\u67b6\u4f9d\u7136\u662f\u4f60\u4eca\u5929\u4eb2\u624b\u62c6\u89e3\u7684\u8fd9\u4e00\u526f\u3002<\/p>\n<p>\u73b0\u5728&#xff0c;\u6253\u5f00\u4f60\u7684 IDE&#xff0c;\u8bd5\u7740\u628a\u8fd9\u4e9b\u4ee3\u7801\u8dd1\u8d77\u6765\u5427&#xff01;<\/p>\n<hr \/>\n<h2>\u540e\u8bb0&#xff1a;\u8d70\u51fa\u9ed1\u76d2&#xff0c;\u770b\u89c1\u661f\u8fb0\u5927\u6d77<\/h2>\n<p>\u5f53\u4f60\u8bfb\u5230\u8fd9\u884c\u5b57\u7684\u65f6\u5019&#xff0c;\u606d\u559c\u4f60&#xff01;\u4f60\u5df2\u7ecf\u6210\u529f\u62c6\u89e3\u4e86\u5f53\u4eca AI 
\u65f6\u4ee3\u6700\u786c\u6838\u7684\u5f15\u64ce\u2014\u2014Transformer\u3002<\/p>\n<p>\u56de\u8fc7\u5934\u770b&#xff0c;\u4f60\u53ef\u80fd\u4e0d\u6562\u76f8\u4fe1&#xff0c;\u90a3\u4e2a\u73b0\u5728\u80fd\u5199\u8bd7\u3001\u80fd\u5199\u4ee3\u7801\u3001\u80fd\u966a\u4f60\u804a\u5929\u7684 ChatGPT&#xff0c;\u90a3\u4e2a\u53f7\u79f0\u62e5\u6709\u201c\u4eba\u7c7b\u667a\u6167\u201d\u7684\u5e9e\u7136\u5927\u7269&#xff0c;\u5176\u5e95\u5c42\u903b\u8f91\u7adf\u7136\u5c31\u662f\u6211\u4eec\u521a\u521a\u804a\u5b8c\u7684\u8fd9\u51e0\u6837\u4e1c\u897f&#xff1a;<\/p>\n<ul>\n<li>\u51e0\u4e2a\u77e9\u9635\u4e58\u6cd5&#xff08;\u8ba9\u8bcd\u4e0e\u8bcd\u4e4b\u95f4\u53d1\u751f\u5316\u5b66\u53cd\u5e94&#xff09;\u3002<\/li>\n<li>\u51e0\u4e2a\u6b8b\u5dee\u8fde\u63a5&#xff08;\u9632\u6b62\u697c\u76d6\u584c\u4e86&#xff09;\u3002<\/li>\n<li>\u4e00\u4e2a Softmax&#xff08;\u628a\u7ed3\u679c\u53d8\u6210\u6982\u7387&#xff09;\u3002<\/li>\n<li>\u518d\u52a0\u4e0a\u65e0\u6570\u7684\u6570\u636e\u548c\u7b97\u529b\u3002<\/li>\n<\/ul>\n<h4>1. \u5e76\u4e0d\u795e\u79d8\u7684\u201c\u9b54\u6cd5\u201d<\/h4>\n<p>\u5199\u8fd9\u7bc7\u535a\u5ba2\u6700\u5927\u7684\u521d\u8877&#xff0c;\u5c31\u662f\u60f3\u6253\u7834\u5bf9 AI \u7684\u201c\u8ff7\u4fe1\u201d\u3002<\/p>\n<p>\u4ee5\u524d\u6211\u4eec\u603b\u89c9\u5f97&#xff0c;\u8981\u8ba9\u673a\u5668\u5b66\u4f1a\u8bed\u8a00&#xff0c;\u5fc5\u987b\u6559\u5b83\u4e3b\u8c13\u5bbe\u3001\u5b9a\u72b6\u8865&#xff0c;\u6559\u5b83\u590d\u6742\u7684\u8bed\u6cd5\u6811\u3002\u4f46 Transformer \u544a\u8bc9\u6211\u4eec&#xff1a;No&#xff0c;\u4f60\u53ea\u9700\u8981\u8ba9\u5b83\u81ea\u5df1\u53bb\u201c\u770b\u201d&#xff08;Attention&#xff09;&#xff0c;\u770b\u8db3\u591f\u591a\u7684\u4e66&#xff0c;\u5b83\u81ea\u5df1\u5c31\u80fd\u6d8c\u73b0\u51fa\u7406\u89e3\u529b\u3002<\/p>\n<p>\u5b83\u6ca1\u6709\u4efb\u4f55\u4e00\u6761\u786c\u6027\u89c4\u5219\u544a\u8bc9\u5b83\u201cit \u6307\u4ee3 
animal\u201d&#xff0c;\u5b83\u53ea\u662f\u901a\u8fc7\u4ebf\u4e07\u6b21\u7684\u8bad\u7ec3&#xff0c;\u7b97\u51fa\u8fd9\u4e24\u4e2a\u8bcd\u5728\u4e00\u8d77\u51fa\u73b0\u7684\u6982\u7387\u6700\u9ad8\u3002<\/p>\n<p>\u8fd9\u662f\u4e00\u79cd\u66b4\u529b\u7f8e\u5b66&#xff0c;\u4e5f\u662f\u4e00\u79cd\u7edf\u8ba1\u5b66\u7684\u80dc\u5229\u3002<\/p>\n<h4>2. \u4e3a\u4ec0\u4e48\u662f Transformer&#xff1f;<\/h4>\n<p>\u4f60\u53ef\u80fd\u4f1a\u95ee&#xff0c;\u672a\u6765\u4f1a\u6709\u6a21\u578b\u53d6\u4ee3 Transformer \u5417&#xff1f; \u7b54\u6848\u662f\u80af\u5b9a\u7684\u3002\u6bd4\u5982\u6700\u8fd1\u5f88\u706b\u7684 Mamba (SSM) \u67b6\u6784\u5c31\u5728\u6311\u6218\u5b83\u7684\u5730\u4f4d\u3002<\/p>\n<p>\u4f46 Transformer \u5728 AI \u5386\u53f2\u4e0a\u7684\u5730\u4f4d\u5c31\u50cf\u662f\u84b8\u6c7d\u673a\u3002 \u5b83\u4e4b\u6240\u4ee5\u4f1f\u5927&#xff0c;\u4e0d\u4ec5\u662f\u56e0\u4e3a\u6548\u679c\u597d&#xff0c;\u66f4\u662f\u56e0\u4e3a\u5b83\u5373\u4f7f\u7c97\u66b4\u5730\u5806\u53e0&#xff0c;\u4e5f\u80fd\u5e76\u884c\u5730\u8fd0\u8f6c&#xff08;\u8bb0\u5f97\u7b2c\u4e00\u7ae0\u7684\u629b\u5f03 RNN \u5417&#xff1f;&#xff09;\u3002\u6b63\u662f\u8fd9\u79cd\u67b6\u6784\u4e0a\u7684\u7a81\u7834&#xff0c;\u624d\u8ba9\u540e\u6765 GPT-4 \u8fd9\u79cd\u4e07\u4ebf\u53c2\u6570\u7684\u201c\u602a\u517d\u201d\u6210\u4e3a\u53ef\u80fd\u3002<\/p>\n<p>\u5b83\u5f00\u542f\u4e86 \u201cScale is all you need\u201d&#xff08;\u89c4\u6a21\u5373\u4e00\u5207&#xff09;\u7684\u65f6\u4ee3\u3002<\/p>\n<h4>3. 
\u4f60\u7684\u4e0b\u4e00\u7ad9<\/h4>\n<p>\u770b\u61c2\u4e86\u539f\u7406\u56fe&#xff0c;\u53ea\u662f\u4e07\u91cc\u957f\u5f81\u7b2c\u4e00\u6b65\u3002\u5982\u679c\u4f60\u60f3\u771f\u6b63\u638c\u63e1\u5b83&#xff0c;\u6211\u7ed9\u4f60\u4e09\u4e2a\u5efa\u8bae&#xff1a;<\/p>\n<ol>\n<li>\u624b\u64b8\u4ee3\u7801&#xff1a;\u4e0d\u8981\u5149\u770b\u535a\u5ba2\u3002\u53bb GitHub \u4e0a\u627e\u4e00\u4efd PyTorch \u5b9e\u73b0 Transformer \u7684\u4ee3\u7801&#xff08;\u6bd4\u5982\u54c8\u4f5b\u5927\u5b66NLP\u7ec4\u8457\u540d\u7684\u300aThe Annotated Transformer\u300b&#xff09;\u3002\u5f53\u4f60\u4eb2\u624b\u6572\u4e0b torch.matmul(q, k.transpose(-2, -1)) \u7684\u90a3\u4e00\u523b&#xff0c;\u4f60\u4f1a\u6709\u4e00\u79cd\u89e6\u7535\u822c\u7684\u611f\u89c9\u3002<\/li>\n<li>\u73a9\u73a9\u5fae\u8c03&#xff1a;\u53bb Hugging Face \u4e0a\u4e0b\u8f7d\u4e00\u4e2a BERT \u6216 GPT-2&#xff0c;\u7528\u4f60\u81ea\u5df1\u7684\u6570\u636e&#xff08;\u6bd4\u5982\u4f60\u7684\u5fae\u4fe1\u804a\u5929\u8bb0\u5f55&#xff09;\u5fae\u8c03\u4e00\u4e0b&#xff0c;\u770b\u770b\u5b83\u80fd\u4e0d\u80fd\u5b66\u4f1a\u4f60\u8bf4\u8bdd\u7684\u8bed\u6c14\u3002<\/li>\n<li>\u4fdd\u6301\u597d\u5947&#xff1a;AI \u9886\u57df\u4e00\u5929\u4e00\u4e2a\u6837\u3002\u4eca\u5929\u6211\u4eec\u5b66\u7684\u662f Self-Attention&#xff0c;\u660e\u5929\u53ef\u80fd\u5c31\u662f Mamba&#xff0c;\u540e\u5929\u53ef\u80fd\u5c31\u662f Q-Star\u3002\u67b6\u6784\u4f1a\u53d8&#xff0c;\u4f46<strong>\u201c\u5982\u4f55\u628a\u73b0\u5b9e\u95ee\u9898\u8f6c\u5316\u4e3a\u6570\u5b66\u5411\u91cf\u201d<\/strong>\u7684\u601d\u7ef4\u65b9\u5f0f\u4e0d\u4f1a\u53d8\u3002<\/li>\n<\/ol>\n<h4>\u5199\u5728\u6700\u540e<\/h4>\n<p>2017 \u5e74&#xff0c;8 \u4f4d Google \u5de5\u7a0b\u5e08\u53d1\u8868\u300aAttention Is All You 
Need\u300b\u65f6&#xff0c;\u4ed6\u4eec\u53ef\u80fd\u4e5f\u6ca1\u60f3\u5230&#xff0c;\u8fd9\u4e2a\u672c\u6765\u7528\u4e8e\u7ffb\u8bd1\u7684\u79ef\u6728\u5757&#xff0c;\u4f1a\u5728\u51e0\u5e74\u540e\u70b9\u71c3\u901a\u7528\u4eba\u5de5\u667a\u80fd&#xff08;AGI&#xff09;\u7684\u706b\u79cd\u3002<\/p>\n<p>\u6211\u4eec\u6b63\u5904\u5728\u4e00\u4e2a\u65b0\u65f6\u4ee3\u7684\u9ece\u660e\u3002\u73b0\u5728&#xff0c;\u4f60\u5df2\u7ecf\u61c2\u5f97\u4e86\u8fd9\u53f0\u201c\u5f15\u64ce\u201d\u7684\u56fe\u7eb8\u3002 \u63a5\u4e0b\u6765&#xff0c;\u8bf7\u5e26\u4e0a\u8fd9\u4e2a\u5de5\u5177&#xff0c;\u53bb\u521b\u9020\u4f60\u81ea\u5df1\u7684\u4e1c\u897f\u5427\u3002<\/p>\n<p>\u56e0\u4e3a\u5728 AI \u7684\u4e16\u754c\u91cc&#xff0c;Attention is all you need&#xff0c;\u4f46 Curiosity&#xff08;\u597d\u5947\u5fc3&#xff09;\u624d\u662f\u4f60\u552f\u4e00\u7684\u8fb9\u754c\u3002<\/p>\n<hr \/>\n<h4>\u5168\u6587\u5b8c<\/h4>\n","protected":false},"excerpt":{"rendered":"<p>\u524d\u8a00&#xff1a;\u63a8\u5f00\u90a3\u6247\u901a\u5f80\u201c\u4e0a\u5e1d\u89c6\u89d2\u201d\u7684\u95e8<br \/>\n&#x1f32a;\ufe0f \u4e00\u573a\u9759\u6084\u6084\u7684\u9769\u547d<br \/>\n\u628a\u65f6\u9488\u62e8\u56de 2017 \u5e74\u3002<br \/>\n\u90a3\u65f6\u5019\u7684 AI \u8fd8\u5728\u8d39\u52b2\u5730\u50cf\u5c0f\u5b66\u751f\u4e00\u6837&#xff0c;\u4e00\u4e2a\u5b57\u4e00\u4e2a\u5b57\u5730\u8bfb\u8bfe\u6587&#xff08;RNN\/LSTM \u65f6\u4ee3&#xff09;\u3002\u5b83\u8bfb\u5230\u53e5\u5b50\u672b\u5c3e\u65f6&#xff0c;\u5f80\u5f80\u5df2\u7ecf\u5fd8\u4e86\u5f00\u5934\u8bb2\u4e86\u4ec0\u4e48\u3002<br \/>\n\u5c31\u5728\u90a3\u4e00\u5e74&#xff0c;Google \u7684\u51e0\u4f4d\u5de5\u7a0b\u5e08\u4e22\u51fa\u4e86\u4e00\u7bc7\u8bba\u6587\u2014\u2014\u300aAttention Is All You Need\u300b\u3002<br 
\/>\n\u8fd9\u7bc7\u8bba\u6587\u5c31\u50cf\u4e00\u5757\u5de8\u77f3\u7838\u8fdb\u4e86\u5e73\u9759\u7684\u6e56<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[152,841,50,86],"topic":[],"class_list":["post-64383","post","type-post","status-publish","format-standard","hentry","category-server","tag-pytorch","tag-transformer","tag-50","tag-86"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/64383.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"\u524d\u8a00&#xff1a;\u63a8\u5f00\u90a3\u6247\u901a\u5f80\u201c\u4e0a\u5e1d\u89c6\u89d2\u201d\u7684\u95e8 &#x1f32a;\ufe0f \u4e00\u573a\u9759\u6084\u6084\u7684\u9769\u547d \u628a\u65f6\u9488\u62e8\u56de 2017 \u5e74\u3002 \u90a3\u65f6\u5019\u7684 AI \u8fd8\u5728\u8d39\u52b2\u5730\u50cf\u5c0f\u5b66\u751f\u4e00\u6837&#xff0c;\u4e00\u4e2a\u5b57\u4e00\u4e2a\u5b57\u5730\u8bfb\u8bfe\u6587&#xff08;RNN\/LSTM \u65f6\u4ee3&#xff09;\u3002\u5b83\u8bfb\u5230\u53e5\u5b50\u672b\u5c3e\u65f6&#xff0c;\u5f80\u5f80\u5df2\u7ecf\u5fd8\u4e86\u5f00\u5934\u8bb2\u4e86\u4ec0\u4e48\u3002 \u5c31\u5728\u90a3\u4e00\u5e74&#xff0c;Google \u7684\u51e0\u4f4d\u5de5\u7a0b\u5e08\u4e22\u51fa\u4e86\u4e00\u7bc7\u8bba\u6587\u2014\u2014\u300aAttention Is All You Need\u300b\u3002 
\u8fd9\u7bc7\u8bba\u6587\u5c31\u50cf\u4e00\u5757\u5de8\u77f3\u7838\u8fdb\u4e86\u5e73\u9759\u7684\u6e56\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/64383.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-01-23T05:40:30+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"52 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/64383.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/64383.html\",\"name\":\"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-01-23T05:40:30+00:00\",\"dateModified\":\"2026-01-23T05:40:30+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/64383.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/64383.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/64383.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required 
name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/64383.html","og_locale":"zh_CN","og_type":"article","og_title":"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u524d\u8a00&#xff1a;\u63a8\u5f00\u90a3\u6247\u901a\u5f80\u201c\u4e0a\u5e1d\u89c6\u89d2\u201d\u7684\u95e8 &#x1f32a;\ufe0f \u4e00\u573a\u9759\u6084\u6084\u7684\u9769\u547d \u628a\u65f6\u9488\u62e8\u56de 2017 \u5e74\u3002 \u90a3\u65f6\u5019\u7684 AI \u8fd8\u5728\u8d39\u52b2\u5730\u50cf\u5c0f\u5b66\u751f\u4e00\u6837&#xff0c;\u4e00\u4e2a\u5b57\u4e00\u4e2a\u5b57\u5730\u8bfb\u8bfe\u6587&#xff08;RNN\/LSTM \u65f6\u4ee3&#xff09;\u3002\u5b83\u8bfb\u5230\u53e5\u5b50\u672b\u5c3e\u65f6&#xff0c;\u5f80\u5f80\u5df2\u7ecf\u5fd8\u4e86\u5f00\u5934\u8bb2\u4e86\u4ec0\u4e48\u3002 \u5c31\u5728\u90a3\u4e00\u5e74&#xff0c;Google \u7684\u51e0\u4f4d\u5de5\u7a0b\u5e08\u4e22\u51fa\u4e86\u4e00\u7bc7\u8bba\u6587\u2014\u2014\u300aAttention Is All You Need\u300b\u3002 
\u8fd9\u7bc7\u8bba\u6587\u5c31\u50cf\u4e00\u5757\u5de8\u77f3\u7838\u8fdb\u4e86\u5e73\u9759\u7684\u6e56","og_url":"https:\/\/www.wsisp.com\/helps\/64383.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-01-23T05:40:30+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"52 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/64383.html","url":"https:\/\/www.wsisp.com\/helps\/64383.html","name":"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-01-23T05:40:30+00:00","dateModified":"2026-01-23T05:40:30+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/64383.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/64383.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/64383.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u56fe\u89e3Transformer\u4e0ePyTorch\u5b9e\u73b0"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/64383","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=64383"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/64383\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=64383"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=64383"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=64383"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=64383"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}