{"id":70070,"date":"2026-02-01T14:17:07","date_gmt":"2026-02-01T06:17:07","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/70070.html"},"modified":"2026-02-01T14:17:07","modified_gmt":"2026-02-01T06:17:07","slug":"%e5%a4%a7%e6%a8%a1%e5%9e%8b%e6%8e%a8%e7%90%86%e5%8a%a0%e9%80%9f%e6%a0%b8%e5%bf%83%e6%8a%80%e6%9c%af%e5%ae%9e%e6%88%98%ef%bc%9akv-cache%e3%80%81%e9%87%8f%e5%8c%96%e3%80%81%e6%a8%a1%e5%9e%8b%e8%92%b8","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/70070.html","title":{"rendered":"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09"},"content":{"rendered":"<h2>\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218&#xff1a;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff08;\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801&#xff09;<\/h2>\n<h3>\u6587\u6863\u6982\u8ff0<\/h3>\n<h4>\u6587\u7ae0\u6838\u5fc3\u4ef7\u503c<\/h4>\n<li>\n<p>\u6df1\u5ea6\u89e3\u6790\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u4e09\u5927\u6838\u5fc3\u6280\u672f&#xff08;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff09;\u7684\u5e95\u5c42\u539f\u7406<\/p>\n<\/li>\n<li>\n<p>\u57fa\u4e8e2025\u5e74\u6700\u65b0\u5f00\u6e90\u751f\u6001&#xff08;vLLM 0.5.0\u3001AutoGPTQ 0.7.1\u3001Transformers 
4.41&#043;&#xff09;\u5b9e\u73b0\u4ee3\u7801\u843d\u5730<\/p>\n<\/li>\n<li>\n<p>\u9488\u5bf9\u4e0d\u540c\u573a\u666f&#xff08;\u672c\u5730\u90e8\u7f72\u3001API\u670d\u52a1\u3001\u8fb9\u7f18\u8bbe\u5907&#xff09;\u7ed9\u51fa\u6280\u672f\u9009\u578b\u4e0e\u6027\u80fd\u4f18\u5316\u6307\u5357<\/p>\n<\/li>\n<li>\n<p>\u63d0\u4f9b\u53ef\u76f4\u63a5\u590d\u7528\u7684\u751f\u4ea7\u7ea7\u4ee3\u7801&#xff08;\u57fa\u4e8eQwen2.5\u3001Llama3\u7b49\u4e3b\u6d41\u5f00\u6e90\u6a21\u578b&#xff09;<\/p>\n<\/li>\n<li>\n<p>\u91cf\u5316\u5404\u6280\u672f\u7684\u52a0\u901f\u6548\u679c\u3001\u663e\u5b58\u5360\u7528\u3001\u7cbe\u5ea6\u635f\u5931\u5bf9\u6bd4&#xff0c;\u6307\u5bfc\u5b9e\u9645\u843d\u5730\u51b3\u7b56<\/p>\n<\/li>\n<h4>\u5b66\u4e60\u76ee\u6807<\/h4>\n<li>\n<p>\u7406\u89e3KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\u7684\u6838\u5fc3\u539f\u7406\u4e0e\u9002\u7528\u8fb9\u754c<\/p>\n<\/li>\n<li>\n<p>\u638c\u63e1\u57fa\u4e8evLLM\u5b9e\u73b0KV Cache\u4f18\u5316\u7684\u5168\u6d41\u7a0b\u4ee3\u7801<\/p>\n<\/li>\n<li>\n<p>\u7cbe\u901aGPTQ\/AWQ\/INT4\/INT8\u91cf\u5316\u7684\u5b9e\u6218\u90e8\u7f72&#xff08;\u517c\u5bb9\u6700\u65b0\u5f00\u6e90\u6a21\u578b&#xff09;<\/p>\n<\/li>\n<li>\n<p>\u5b66\u4f1a\u4ece\u5927\u6a21\u578b&#xff08;7B&#xff09;\u84b8\u998f\u51fa\u8f7b\u91cf\u6a21\u578b&#xff08;1.8B&#xff09;\u7684\u5b8c\u6574\u6d41\u7a0b<\/p>\n<\/li>\n<li>\n<p>\u80fd\u591f\u6839\u636e\u4e1a\u52a1\u573a\u666f\u9009\u62e9\u6700\u4f18\u7684\u63a8\u7406\u52a0\u901f\u7ec4\u5408\u65b9\u6848<\/p>\n<\/li>\n<li>\n<p>\u7406\u89e3\u63a8\u7406\u52a0\u901f\u4e2d\u7684\u7cbe\u5ea6-\u6027\u80fd\u5e73\u8861\u7b56\u7565<\/p>\n<\/li>\n<h3>\u4e00\u3001\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6280\u672f\u6982\u8ff0<\/h3>\n<h4>1.1 \u63a8\u7406\u52a0\u901f\u7684\u6838\u5fc3\u75db\u70b9<\/h4>\n<p>\u5927\u6a21\u578b&#xff08;\u5982 
7B\/14B\/70B&#xff09;\u5728\u5b9e\u9645\u751f\u4ea7\u73af\u5883\u7684\u90e8\u7f72\u4e2d&#xff0c;\u9762\u4e34\u7740\u4ece\u786c\u4ef6\u8d44\u6e90\u5230\u7528\u6237\u4f53\u9a8c\u7684\u591a\u91cd\u6311\u6218\u3002\u8fd9\u4e9b\u75db\u70b9\u4e3b\u8981\u6e90\u4e8e Transformer \u67b6\u6784\u5728\u5927\u89c4\u6a21\u53c2\u6570\u4e0b\u7684\u8ba1\u7b97\u7279\u6027\u7684\u74f6\u9888\u3002<\/p>\n<h5>1.1.1 \u663e\u5b58\u5360\u7528\u9ad8 (Memory Capacity Bottleneck)<\/h5>\n<p>\u5927\u6a21\u578b\u7684\u663e\u5b58\u5360\u7528\u4e0d\u4ec5\u4ec5\u662f\u201c\u6743\u91cd\u6587\u4ef6\u201d\u7684\u5927\u5c0f&#xff0c;\u5b83\u7531\u4ee5\u4e0b\u4e09\u90e8\u5206\u7ec4\u6210&#xff1a;<\/p>\n<ul>\n<li>\n<p>\u6a21\u578b\u6743\u91cd\u5360\u7528&#xff1a;\u4ee5 FP16 \u7cbe\u5ea6\u4e3a\u4f8b&#xff0c;\u6bcf\u4e2a\u53c2\u6570\u5360\u636e 2 \u5b57\u8282&#xff08;Bytes&#xff09;\u3002<\/p>\n<ul>\n<li>\n<p>7B \u6a21\u578b&#xff1a;\u7ea6 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>             7<\/p>\n<p>             \u00d7<\/p>\n<p>              10<\/p>\n<p>              9<\/p>\n<p>             \u00d7<\/p>\n<p>             2<\/p>\n<p>             &#061;<\/p>\n<p>             14<\/p>\n<p>             \u00a0GB<\/p>\n<p>             7 \\\\times 10^9 \\\\times 2 &#061; 14\\\\text{ GB} <\/p>\n<p>         <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.9474em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">1<\/span><span class=\"mord\"><span class=\"mord\">0<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8641em\"><span 
class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">9<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">2<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord\">14<\/span><span class=\"mord text\"><span class=\"mord\">\u00a0GB<\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u3002<\/p>\n<\/li>\n<li>\n<p>70B \u6a21\u578b&#xff1a;\u7ea6 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>             140<\/p>\n<p>             \u00a0GB<\/p>\n<p>             140\\\\text{ GB} <\/p>\n<p>         <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord\">140<\/span><span class=\"mord text\"><span class=\"mord\">\u00a0GB<\/span><\/span><\/span><\/span><\/span><\/span><\/span> &#xff0c;\u8fd9\u8d85\u51fa\u4e86\u5355\u5f20\u9876\u7ea7\u663e\u5361&#xff08;\u5982 A100 80GB&#xff09;\u7684\u627f\u8f7d\u80fd\u529b\u3002<\/p>\n<\/li>\n<\/ul>\n<\/li>\n<li>\n<p>KV Cache \u52a8\u6001\u5360\u7528&#xff1a;\u5728\u591a\u8f6e\u5bf9\u8bdd\u4e2d&#xff0c;\u4e3a\u4e86\u907f\u514d\u91cd\u590d\u8ba1\u7b97&#xff0c;\u7cfb\u7edf\u4f1a\u7f13\u5b58\u5386\u53f2\u7684 Key \u548c Value 
\u5411\u91cf\u3002\u968f\u7740\u4e0a\u4e0b\u6587&#xff08;Context&#xff09;\u957f\u5ea6\u589e\u52a0&#xff0c;\u663e\u5b58\u5360\u7528\u5448\u7ebf\u6027\u589e\u957f\u3002\u5bf9\u4e8e\u957f\u6587\u672c\u5e94\u7528&#xff0c;KV Cache \u5f80\u5f80\u4f1a\u6210\u4e3a\u5bfc\u81f4\u663e\u5b58\u6ea2\u51fa&#xff08;OOM&#xff09;\u7684\u4e3b\u8981\u539f\u56e0\u3002<\/p>\n<\/li>\n<li>\n<p>\u6fc0\u6d3b\u503c\u4e0e\u4e34\u65f6\u8ba1\u7b97\u7f13\u51b2\u533a&#xff1a;\u63a8\u7406\u8fc7\u7a0b\u4e2d\u7684\u4e2d\u95f4\u5c42\u8f93\u51fa\u4e5f\u9700\u5360\u636e\u6570 GB \u7a7a\u95f4\u3002<\/p>\n<\/li>\n<\/ul>\n<h5>1.1.2 \u63a8\u7406\u901f\u5ea6\u6162 (Computation &amp; Memory Bandwidth Bottleneck)<\/h5>\n<p>\u63a8\u7406\u901f\u5ea6\u901a\u5e38\u53d7\u9650\u4e8e\u4e24\u4e2a\u622a\u7136\u4e0d\u540c\u7684\u9636\u6bb5&#xff1a;<\/p>\n<ul>\n<li>\u9884\u586b\u5145\u9636\u6bb5 (Prefill Phase)&#xff1a;\u5904\u7406\u7528\u6237\u8f93\u5165\u7684 Prompt\u3002\u8fd9\u662f\u8ba1\u7b97\u5bc6\u96c6\u578b\u7684&#xff0c;\u6a21\u578b\u9700\u8981\u4e00\u6b21\u6027\u5e76\u884c\u5904\u7406\u6240\u6709\u8f93\u5165 Token&#xff0c;\u8ba1\u7b97\u91cf\u5de8\u5927\u3002<\/li>\n<li>\u89e3\u7801\u9636\u6bb5 (Decoding Phase)&#xff1a;\u9010\u4e2a\u751f\u6210 Token\u3002\u8fd9\u662f\u5178\u578b\u7684**\u8bbf\u5b58\u5bc6\u96c6\u578b&#xff08;Memory Bound&#xff09;**\u8fc7\u7a0b\u3002\u6bcf\u751f\u6210\u4e00\u4e2a Token&#xff0c;\u90fd\u9700\u8981\u5c06\u6570\u5341 GB \u7684\u6743\u91cd\u4ece\u663e\u5b58\u8bfb\u53d6\u5230\u8ba1\u7b97\u6838\u5fc3\u4e2d\u3002\u7531\u4e8e\u201c\u8bfb\u6743\u91cd\u201d\u7684\u901f\u5ea6\u8fdc\u6162\u4e8e\u201c\u8ba1\u7b97\u201d\u901f\u5ea6&#xff0c;\u5bfc\u81f4\u751f\u6210\u901f\u5ea6\u53d7\u9650&#xff0c;\u5355\u8f6e\u54cd\u5e94\u5f80\u5f80\u8d85\u8fc7 5 \u79d2&#xff0c;\u96be\u4ee5\u5b9e\u73b0\u6d41\u7545\u7684\u5b9e\u65f6\u4ea4\u4e92\u3002<\/li>\n<\/ul>\n<h5>1.1.3 \u90e8\u7f72\u6210\u672c\u4e0e\u8fb9\u7f18\u5316\u96be\u9898 (Deployment &amp; Edge 
Scaling)<\/h5>\n<ul>\n<li>\u9ad8\u89c4\u683c\u786c\u4ef6\u4f9d\u8d56&#xff1a;\u4e3a\u4e86\u8fd0\u884c 70B \u6a21\u578b&#xff0c;\u4f01\u4e1a\u5f80\u5f80\u9700\u8981\u914d\u7f6e 8 \u5361 A100\/H100 \u7ec4\u6210\u7684\u8ba1\u7b97\u96c6\u7fa4&#xff0c;\u5355\u53f0\u670d\u52a1\u5668\u6210\u672c\u9ad8\u8fbe\u767e\u4e07\u7ea7\u4eba\u6c11\u5e01\u3002<\/li>\n<li>\u80fd\u6548\u6bd4\u6781\u4f4e&#xff1a;\u9ad8\u6027\u80fd GPU \u529f\u8017\u5de8\u5927&#xff0c;\u5355\u5361\u5cf0\u503c\u53ef\u8fbe 300W-700W&#xff0c;\u5bf9\u4e8e\u624b\u673a\u3001\u5e73\u677f\u6216\u5de5\u4e1a\u7269\u8054\u7f51&#xff08;IoT&#xff09;\u7b49\u8fb9\u7f18\u8bbe\u5907&#xff0c;\u9ad8\u6602\u7684\u80fd\u8017\u548c\u53d1\u70ed\u91cf\u4f7f\u5176\u65e0\u6cd5\u76f4\u63a5\u8fd0\u884c\u539f\u59cb\u5927\u6a21\u578b\u3002<\/li>\n<li>\u5e26\u5bbd\u74f6\u9888&#xff1a;\u5728\u516c\u6709\u4e91 API \u670d\u52a1\u4e2d&#xff0c;\u9ad8\u5e76\u53d1\u8bf7\u6c42\u4e0b\u663e\u5b58\u5e26\u5bbd\u7684\u4e89\u62a2\u4f1a\u5bfc\u81f4\u541e\u5410\u91cf\u6025\u5267\u4e0b\u964d&#xff0c;\u8fdb\u4e00\u6b65\u63a8\u9ad8\u4e86\u5355\u6b21\u8c03\u7528\u7684\u6210\u672c\u3002<\/li>\n<\/ul>\n<h4>1.2 \u4e09\u5927\u6838\u5fc3\u52a0\u901f\u6280\u672f\u5bf9\u6bd4<\/h4>\n<table>\n<tr>\u6280\u672f\u7c7b\u578b\u6838\u5fc3\u539f\u7406\u52a0\u901f\u6548\u679c\u663e\u5b58\u8282\u7701\u7cbe\u5ea6\u635f\u5931\u9002\u7528\u573a\u666f<\/tr>\n<tbody>\n<tr>\n<td>KV 
Cache<\/td>\n<td>\u7f13\u5b58\u6ce8\u610f\u529b\u673a\u5236\u7684Key\/Value\u77e9\u9635&#xff0c;\u907f\u514d\u91cd\u590d\u8ba1\u7b97<\/td>\n<td>2-5\u500d&#xff08;\u957f\u6587\u672c\u5bf9\u8bdd&#xff09;<\/td>\n<td>10-20%<\/td>\n<td>\u65e0<\/td>\n<td>\u591a\u8f6e\u5bf9\u8bdd\u3001\u957f\u6587\u672c\u751f\u6210<\/td>\n<\/tr>\n<tr>\n<td>\u91cf\u5316&#xff08;INT4\/INT8\/GPTQ&#xff09;<\/td>\n<td>\u5c06\u6a21\u578b\u6743\u91cd\u4eceFP16\/FP32\u964d\u81f3\u4f4e\u7cbe\u5ea6&#xff08;INT4\/INT8&#xff09;<\/td>\n<td>1.5-3\u500d<\/td>\n<td>50-70%<\/td>\n<td>\u8f7b\u5fae&#xff08;&lt;2%&#xff09;<\/td>\n<td>\u663e\u5b58\u53d7\u9650\u573a\u666f\u3001\u6279\u91cf\u63a8\u7406<\/td>\n<\/tr>\n<tr>\n<td>\u6a21\u578b\u84b8\u998f<\/td>\n<td>\u7528\u5927\u6a21\u578b&#xff08;\u6559\u5e08&#xff09;\u6307\u5bfc\u5c0f\u6a21\u578b&#xff08;\u5b66\u751f&#xff09;\u5b66\u4e60&#xff0c;\u4fdd\u7559\u6838\u5fc3\u80fd\u529b<\/td>\n<td>3-10\u500d<\/td>\n<td>70-90%<\/td>\n<td>\u53ef\u63a7&#xff08;&lt;5%&#xff09;<\/td>\n<td>\u8fb9\u7f18\u8bbe\u5907\u3001\u9ad8\u5e76\u53d1API\u670d\u52a1<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>1.3 \u6280\u672f\u9009\u578b\u539f\u5219<\/h4>\n<p>\u63a8\u7406\u52a0\u901f\u5e76\u975e\u5355\u4e00\u6280\u672f\u7684\u5806\u53e0&#xff0c;\u800c\u662f\u57fa\u4e8e\u786c\u4ef6\u74f6\u9888\u4e0e\u4e1a\u52a1\u6307\u6807\u7684\u6743\u8861\u6e38\u620f\u3002\u4ee5\u4e0b\u662f\u57fa\u4e8e\u663e\u5b58&#xff08;Memory&#xff09;\u3001\u5ef6\u8fdf&#xff08;Latency&#xff09;\u548c\u541e\u5410\u91cf&#xff08;Throughput&#xff09;\u7684\u7cfb\u7edf\u6027\u9009\u578b\u7b56\u7565\u3002<\/p>\n<h5>1.3.1 \u6838\u5fc3\u51b3\u7b56\u6811 (Decision Tree)<\/h5>\n<p>\u6211\u4eec\u5c06\u9009\u578b\u903b\u8f91\u53ef\u89c6\u5316\u4e3a\u4e00\u68f5\u51b3\u7b56\u6811&#xff0c;\u5e2e\u52a9\u5f00\u53d1\u8005\u5feb\u901f\u5b9a\u4f4d\u6700\u4f18\u65b9\u6848&#xff1a;<\/p>\n<h5>1.3.2 \u573a\u666f\u5316\u9009\u578b\u7b56\u7565 (Scenario-Based 
Strategy)<\/h5>\n<p>\u6211\u4eec\u6839\u636e\u6700\u5e38\u89c1\u7684\u8d44\u6e90\u74f6\u9888&#xff0c;\u5c06\u9009\u578b\u539f\u5219\u7ec6\u5316\u4e3a\u4ee5\u4e0b\u4e09\u4e2a\u7ef4\u5ea6&#xff1a;<\/p>\n<p>1. \u663e\u5b58\u53d7\u9650\u573a\u666f (Memory-Bound)<\/p>\n<ul>\n<li>\u75db\u70b9&#xff1a;\u6a21\u578b\u52a0\u8f7d\u540e\u663e\u5b58\u5269\u4f59\u4e0d\u8db3 20%&#xff0c;\u6216\u9891\u7e41\u89e6\u53d1 OOM&#xff08;Out of Memory&#xff09;\u3002<\/li>\n<li>\u9996\u9009\u7b56\u7565&#xff1a;\u91cf\u5316 (Quantization) \u662f\u552f\u4e00\u80fd\u663e\u8457\u964d\u4f4e\u9759\u6001\u663e\u5b58\u7684\u6280\u672f\u3002\n<ul>\n<li>\u8f7b\u5ea6\u4e0d\u8db3&#xff1a;\u4f7f\u7528 INT8 \u91cf\u5316&#xff08;\u5982 load_in_8bit&#xff09;&#xff0c;\u8282\u7701\u7ea6 50% \u663e\u5b58&#xff0c;\u7cbe\u5ea6\u51e0\u4e4e\u65e0\u635f\u3002<\/li>\n<li>\u4e25\u91cd\u4e0d\u8db3&#xff1a;\u4f7f\u7528 INT4\/GPTQ \u91cf\u5316&#xff0c;\u8282\u7701\u7ea6 70% \u663e\u5b58&#xff0c;\u9002\u5408 7B \u6a21\u578b\u8dd1\u5728 8GB \u663e\u5b58\u7684\u6d88\u8d39\u7ea7\u663e\u5361\u4e0a\u3002<\/li>\n<li>\u8865\u5145\u624b\u6bb5&#xff1a;\u542f\u7528 vLLM \u7684 swap_space&#xff0c;\u5c06\u90e8\u5206 KV Cache \u5378\u8f7d\u5230\u5185\u5b58\u4e2d&#xff0c;\u9632\u6b62\u7a81\u53d1 OOM\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>2. 
\u5ef6\u8fdf\/\u54cd\u5e94\u901f\u5ea6\u654f\u611f\u573a\u666f (Latency-Sensitive)<\/p>\n<ul>\n<li>\u75db\u70b9&#xff1a;\u9996\u5b57\u751f\u6210\u6162&#xff08;TTFT \u9ad8&#xff09;&#xff0c;\u6216\u6bcf\u79d2\u751f\u6210 Token \u6570&#xff08;TPS&#xff09;\u4f4e&#xff0c;\u7528\u6237\u7b49\u5f85\u611f\u5f3a\u3002<\/li>\n<li>\u9996\u9009\u7b56\u7565&#xff1a;KV Cache &#043; \u7b97\u5b50\u878d\u5408\u3002\n<ul>\n<li>\u957f\u6587\u672c\/\u591a\u8f6e\u5bf9\u8bdd&#xff1a;\u5fc5\u987b\u5f00\u542f KV Cache&#xff0c;\u907f\u514d\u968f\u4e0a\u4e0b\u6587\u589e\u957f\u800c\u5bfc\u81f4\u7684\u5e73\u65b9\u7ea7\u8ba1\u7b97\u91cf\u589e\u52a0\u3002<\/li>\n<li>\u6781\u81f4\u901f\u5ea6&#xff1a;\u4f7f\u7528 INT4 \u91cf\u5316\u3002\u867d\u7136\u91cf\u5316\u4e3b\u8981\u7701\u663e\u5b58&#xff0c;\u4f46\u7531\u4e8e\u51cf\u5c11\u4e86\u4ece\u663e\u5b58\u8bfb\u53d6\u6743\u91cd\u7684\u6570\u636e\u91cf&#xff08;Memory Access&#xff09;&#xff0c;\u5728\u5e26\u5bbd\u53d7\u9650\u7684 GPU \u4e0a\u80fd\u663e\u8457\u63d0\u5347\u63a8\u7406\u901f\u5ea6&#xff08;2-3\u500d&#xff09;\u3002<\/li>\n<li>\u63a8\u8350\u7ec4\u5408&#xff1a;KV Cache &#043; INT4 GPTQ &#043; FlashAttention-2&#xff08;vLLM \u9ed8\u8ba4\u96c6\u6210&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<p>3. 
\u8fb9\u7f18\u7aef\/\u4f4e\u7b97\u529b\u573a\u666f (Edge\/Low-Resource)<\/p>\n<ul>\n<li>\u75db\u70b9&#xff1a;\u8bbe\u5907\u7b97\u529b\u5f31&#xff08;\u5982 Jetson Orin\u3001\u624b\u673a NPU&#xff09;&#xff0c;\u65e0\u6cd5\u8fd0\u884c\u6807\u51c6 7B \u6a21\u578b\u3002<\/li>\n<li>\u9996\u9009\u7b56\u7565&#xff1a;\u6a21\u578b\u84b8\u998f (Distillation) &#043; \u6781\u81f4\u91cf\u5316\u3002\n<ul>\n<li>\u6a21\u578b\u7626\u8eab&#xff1a;\u5148\u5c06 7B\/14B \u6a21\u578b\u84b8\u998f\u4e3a 1.8B\/3B \u7684\u5c0f\u6a21\u578b&#xff0c;\u76f4\u63a5\u964d\u4f4e\u8ba1\u7b97\u91cf\u7ea7\u3002<\/li>\n<li>\u6781\u81f4\u538b\u7f29&#xff1a;\u914d\u5408 AWQ\/INT4 \u91cf\u5316&#xff0c;\u5c06\u663e\u5b58\u5360\u7528\u538b\u81f3 2GB-4GB \u533a\u95f4\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h5>1.3.3 \u7cbe\u5ea6\u4e0e\u6027\u80fd\u7684\u5e73\u8861 (Trade-off Matrix)<\/h5>\n<p>\u4e3a\u4e86\u5728\u201c\u53c8\u5feb\u53c8\u597d\u201d\u4e4b\u95f4\u627e\u5230\u5e73\u8861\u70b9&#xff0c;\u8bf7\u53c2\u8003\u4ee5\u4e0b\u7ec4\u5408\u63a8\u8350&#xff1a;<\/p>\n<table>\n<tr>\u4e1a\u52a1\u573a\u666f\u63a8\u8350\u7ec4\u5408\u65b9\u6848\u9884\u671f\u7cbe\u5ea6\u635f\u5931\u9884\u671f\u52a0\u901f\u6bd4\u5178\u578b\u914d\u7f6e\u4ee3\u7801\u53c2\u8003<\/tr>\n<tbody>\n<tr>\n<td>\u9ad8\u7cbe\u5ea6\u79d1\u7814\/\u91d1\u878d<\/td>\n<td>FP16 (\u539f\u59cb) &#043; KV Cache<\/td>\n<td>0% (\u57fa\u51c6)<\/td>\n<td>1.0x<\/td>\n<td>vllm.LLM(&#8230;, dtype&#061;&#039;float16&#039;)<\/td>\n<\/tr>\n<tr>\n<td>\u901a\u7528\u5bf9\u8bdd\/\u5ba2\u670d<\/td>\n<td>INT8 \u91cf\u5316 &#043; KV Cache (\u4e3b\u6d41\u63a8\u8350)<\/td>\n<td>&lt; 1% (\u51e0\u4e4e\u65e0\u611f)<\/td>\n<td>1.5x &#8211; 2.0x<\/td>\n<td>load_in_8bit&#061;True<\/td>\n<\/tr>\n<tr>\n<td>\u9ad8\u5e76\u53d1 API \u670d\u52a1<\/td>\n<td>INT4 GPTQ &#043; vLLM PagedAttention<\/td>\n<td>&lt; 2%<\/td>\n<td>2.0x &#8211; 3.0x<\/td>\n<td>quantization&#061;&#039;gptq&#039;<\/td>\n<\/tr>\n<tr>\n<td>\u8fb9\u7f18\u8bbe\u5907\/IoT<\/td>\n<td>\u84b8\u998f (1.8B) &#043; INT4 &#043; KV 
Cache<\/td>\n<td>&lt; 5% (\u7279\u5b9a\u4efb\u52a1\u53ef\u63a7)<\/td>\n<td>5.0x &#8211; 10.0x<\/td>\n<td>\u89c1\u7ae0\u8282 5.3<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>\u4e8c\u3001\u73af\u5883\u914d\u7f6e&#xff08;\u7edf\u4e00\u57fa\u7840\u73af\u5883&#xff09;<\/h3>\n<h4>2.1 \u90e8\u7f72\u73af\u5883\u8981\u6c42<\/h4>\n<table>\n<tr>\u73af\u5883\u7c7b\u578b\u786c\u4ef6\u914d\u7f6e\u8f6f\u4ef6\u4f9d\u8d56<\/tr>\n<tbody>\n<tr>\n<td>\u57fa\u7840\u6d4b\u8bd5<\/td>\n<td>CPU 8\u6838&#043;\u3001\u5185\u5b58 32GB&#043;<\/td>\n<td>Python 3.10-3.11\u3001Conda<\/td>\n<\/tr>\n<tr>\n<td>GPU\u52a0\u901f<\/td>\n<td>NVIDIA GPU&#xff08;CUDA 12.1&#043;&#xff0c;\u663e\u5b58 8GB&#043;&#xff09;<\/td>\n<td>CUDA 12.1\u3001cuDNN 8.9<\/td>\n<\/tr>\n<tr>\n<td>\u751f\u4ea7\u90e8\u7f72<\/td>\n<td>GPU&#xff08;A10\/T4\/V100\/A100&#xff09;<\/td>\n<td>Docker\u3001FastAPI\u3001vLLM<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>2.2 \u73af\u5883\u914d\u7f6e\u6b65\u9aa4<\/h4>\n<h5>2.2.1 Conda\u73af\u5883\u521b\u5efa<\/h5>\n<p># \u521b\u5efa\u4e13\u5c5e\u73af\u5883<br \/>\nconda create -n llm-inference-accel python&#061;3.10<br \/>\n# \u6fc0\u6d3b\u73af\u5883<br \/>\nconda activate llm-inference-accel<\/p>\n<h5>2.2.2 \u6838\u5fc3\u4f9d\u8d56\u5b89\u88c5<\/h5>\n<p># \u57fa\u7840\u4f9d\u8d56<br \/>\npip install torch&#061;&#061;2.5.0 torchvision&#061;&#061;0.20.0 torchaudio&#061;&#061;2.5.0 &#8211;index-url https:\/\/download.pytorch.org\/whl\/cu121<br \/>\npip install transformers&#061;&#061;4.41.2 accelerate&#061;&#061;0.31.0 datasets&#061;&#061;2.20.0<\/p>\n<p># KV Cache\u4f18\u5316&#xff1a;vLLM&#xff08;\u6700\u65b0\u7a33\u5b9a\u7248&#xff09;<br \/>\npip install vllm&#061;&#061;0.5.0<\/p>\n<p># \u91cf\u5316\u4f9d\u8d56<br \/>\npip install auto-gptq&#061;&#061;0.7.1 bitsandbytes&#061;&#061;0.43.1<br \/>\npip install optimum&#061;&#061;1.20.0 trl&#061;&#061;0.8.6<\/p>\n<p># \u84b8\u998f\u4f9d\u8d56<br \/>\npip install peft&#061;&#061;0.11.1 sentencepiece&#061;&#061;0.1.99<\/p>\n<p># \u8f85\u52a9\u5de5\u5177<br \/>\npip 
install modelscope&#061;&#061;1.15.0 python-dotenv&#061;&#061;1.0.1 psutil&#061;&#061;5.9.8<\/p>\n<h3>\u4e09\u3001KV Cache \u4f18\u5316\u5b9e\u6218&#xff08;\u57fa\u4e8evLLM&#xff09;<\/h3>\n<h4>3.1 KV Cache \u6838\u5fc3\u539f\u7406&#xff1a;\u4ee5\u7a7a\u95f4\u6362\u65f6\u95f4<\/h4>\n<p>\u8fd9\u662f\u4e3a\u60a8\u6df1\u5ea6\u4fee\u8ba2\u7684\u201c3.1 KV Cache \u6838\u5fc3\u539f\u7406\u201d\u4e0e\u201c3.2 \u9002\u7528\u573a\u666f\u201d\u90e8\u5206\u3002<\/p>\n<p>\u6b64\u6b21\u4fee\u6539\u7684\u6838\u5fc3\u5728\u4e8e&#xff1a;<\/p>\n<li>\u6df1\u5316\u539f\u7406&#xff1a;\u4ece\u201c\u7a7a\u95f4\u6362\u65f6\u95f4\u201d\u7684\u672c\u8d28\u51fa\u53d1&#xff0c;\u89e3\u6790\u4e86 Self-Attention \u7684\u8ba1\u7b97\u5197\u4f59\u3002<\/li>\n<li>\u589e\u52a0\u53ef\u89c6\u5316&#xff1a;\u6dfb\u52a0\u4e86 KV Cache \u8fd0\u4f5c\u673a\u5236\u6811\u5f62\u56fe&#xff0c;\u76f4\u89c2\u5c55\u793a\u6570\u636e\u6d41\u5411\u3002<\/li>\n<li>\u65b0\u589e\u5173\u952e\u786c\u6838\u77e5\u8bc6&#xff1a;\u8865\u5145\u4e86 KV Cache \u663e\u5b58\u8ba1\u7b97\u516c\u5f0f&#xff0c;\u8fd9\u662f\u5b9e\u9645\u90e8\u7f72\u4e2d\u8ba1\u7b97 GPU \u8d44\u6e90\u9700\u6c42\u7684\u5173\u952e\u4f9d\u636e&#xff08;\u60a8\u89c9\u5f97\u6709\u5fc5\u8981\u6dfb\u52a0\u7684\u5185\u5bb9&#xff09;\u3002<\/li>\n<li>PagedAttention \u7c7b\u6bd4&#xff1a;\u7528\u201c\u64cd\u4f5c\u7cfb\u7edf\u865a\u62df\u5185\u5b58\u201d\u7684\u7ecf\u5178\u7c7b\u6bd4\u89e3\u91ca vLLM \u7684\u6838\u5fc3\u4f18\u52bf\u3002<\/li>\n<hr \/>\n<h3>\u4e09\u3001 KV Cache \u4f18\u5316\u5b9e\u6218&#xff08;\u57fa\u4e8e vLLM&#xff09;<\/h3>\n<h4>3.1 KV Cache \u6838\u5fc3\u539f\u7406&#xff1a;\u4ee5\u7a7a\u95f4\u6362\u65f6\u95f4<\/h4>\n<p>KV Cache&#xff08;Key-Value Cache&#xff09;\u662f\u5927\u6a21\u578b\u63a8\u7406\u6027\u80fd\u4f18\u5316\u7684\u201c\u57fa\u77f3\u201d\u3002\u5728 Transformer \u67b6\u6784\u7684\u81ea\u6ce8\u610f\u529b\u673a\u5236&#xff08;Self-Attention&#xff09;\u4e2d&#xff0c;\u751f\u6210\u6bcf\u4e00\u4e2a\u65b0 Token 
\u65f6&#xff0c;\u90fd\u9700\u8981\u8ba1\u7b97\u5b83\u4e0e\u4e4b\u524d\u6240\u6709 Token \u7684\u76f8\u4e92\u5173\u7cfb&#xff08;Attention Score&#xff09;\u3002<\/p>\n<h5>3.1.1 \u8fd0\u4f5c\u673a\u5236\u89e3\u6790<\/h5>\n<ul>\n<li>\n<p>\u65e0 KV Cache&#xff08;\u8ba1\u7b97\u5197\u4f59&#xff09;&#xff1a;<\/p>\n<p>\u6bcf\u751f\u6210\u7b2c N\u4e2a Token&#xff0c;\u6a21\u578b\u9700\u8981\u91cd\u65b0\u8ba1\u7b97\u524d N-1 \u4e2a Token \u7684 Key \u548c Value \u77e9\u9635\u3002\u8fd9\u610f\u5473\u7740\u5386\u53f2\u4fe1\u606f\u88ab\u53cd\u590d\u8ba1\u7b97&#xff0c;\u8ba1\u7b97\u91cf\u968f\u5e8f\u5217\u957f\u5ea6\u5448 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           O<\/p>\n<p>           (<\/p>\n<p>            N<\/p>\n<p>            2<\/p>\n<p>           )<\/p>\n<p>           O(N^2) <\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1.1141em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">O<\/span><span class=\"mopen\">(<\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.109em\">N<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8641em\"><span class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/span> \u589e\u957f\u3002<\/p>\n<\/li>\n<li>\n<p>\u6709 KV Cache&#xff08;\u589e\u91cf\u8ba1\u7b97&#xff09;&#xff1a;<\/p>\n<p>\u7cfb\u7edf\u5c06\u5386\u53f2 Token \u8ba1\u7b97\u8fc7\u7684 Key \u548c Value \u77e9\u9635\u9a7b\u7559\u5728\u663e\u5b58\u4e2d\u3002\u751f\u6210\u7b2c N\u4e2a Token \u65f6&#xff0c;\u53ea\u9700\u8ba1\u7b97\u5f53\u524d Token 
\u7684 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>            K<\/p>\n<p>            N<\/p>\n<p>           ,<\/p>\n<p>            V<\/p>\n<p>            N<\/p>\n<p>           K_N, V_N <\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8778em;vertical-align: -0.1944em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3283em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0715em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.109em\">N<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3283em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.2222em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.109em\">N<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> 
&#xff0c;\u7136\u540e\u4e0e\u7f13\u5b58\u4e2d\u7684\u5386\u53f2 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>            K<\/p>\n<p>             1<\/p>\n<p>             :<\/p>\n<p>             N<\/p>\n<p>             \u2212<\/p>\n<p>             1<\/p>\n<p>           ,<\/p>\n<p>            V<\/p>\n<p>             1<\/p>\n<p>             :<\/p>\n<p>             N<\/p>\n<p>             \u2212<\/p>\n<p>             1<\/p>\n<p>           K_{1:N-1}, V_{1:N-1} <\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.8917em;vertical-align: -0.2083em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3283em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0715em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">1<\/span><span class=\"mrel mtight\">:<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.109em\">N<\/span><span class=\"mbin mtight\">\u2212<\/span><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2083em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.2222em\">V<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3283em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.2222em;margin-right: 0.05em\"><span 
class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\"><span class=\"mord mtight\">1<\/span><span class=\"mrel mtight\">:<\/span><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.109em\">N<\/span><span class=\"mbin mtight\">\u2212<\/span><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2083em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> \u62fc\u63a5\u3002\u8ba1\u7b97\u91cf\u964d\u4f4e\u4e3a\u7ebf\u6027\u7684 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           O<\/p>\n<p>           (<\/p>\n<p>           N<\/p>\n<p>           )<\/p>\n<p>           O(N) <\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0278em\">O<\/span><span class=\"mopen\">(<\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.109em\">N<\/span><span class=\"mclose\">)<\/span><\/span><\/span><\/span><\/span><\/span> \u3002<\/p>\n<\/li>\n<\/ul>\n<h5>3.1.2 \u6838\u5fc3\u673a\u5236\u4e0e\u4f18\u5316\u67b6\u6784\u56fe (Tree Structure)<\/h5>\n<p>\u4ee3\u7801\u6bb5<\/p>\n<p>graph TD<br \/>\n    A<span class=\"token punctuation\">[<\/span>\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f: KV Cache<span class=\"token punctuation\">]<\/span> &#8212;<span class=\"token operator\">&gt;<\/span> B<span class=\"token punctuation\">(<\/span>\u6838\u5fc3\u539f\u7406: \u7a7a\u95f4\u6362\u65f6\u95f4<span class=\"token punctuation\">)<\/span><br \/>\n    B &#8212;<span class=\"token operator\">&gt;<\/span> B1<span class=\"token punctuation\">[<\/span>Prefill\u9636\u6bb5: \u5e76\u884c\u8ba1\u7b97, 
\u586b\u5145Cache<span class=\"token punctuation\">]<\/span><br \/>\n    B &#8212;<span class=\"token operator\">&gt;<\/span> B2<span class=\"token punctuation\">[<\/span>Decode\u9636\u6bb5: \u4e32\u884c\u751f\u6210, \u8bfb\u53d6Cache<span class=\"token punctuation\">]<\/span><\/p>\n<p>    A &#8212;<span class=\"token operator\">&gt;<\/span> C<span class=\"token punctuation\">{<\/span>\u5185\u5b58\u74f6\u9888\u6311\u6218<span class=\"token punctuation\">}<\/span><br \/>\n    C &#8212;<span class=\"token operator\">&gt;<\/span> C1<span class=\"token punctuation\">[<\/span>\u663e\u5b58\u5360\u7528\u968f\u957f\u5ea6\u7ebf\u6027\u589e\u957f<span class=\"token punctuation\">]<\/span><br \/>\n    C &#8212;<span class=\"token operator\">&gt;<\/span> C2<span class=\"token punctuation\">[<\/span>\u663e\u5b58\u788e\u7247\u5316\u95ee\u9898<span class=\"token punctuation\">]<\/span><\/p>\n<p>    A &#8212;<span class=\"token operator\">&gt;<\/span> D<span class=\"token punctuation\">[<\/span>\u89e3\u51b3\u65b9\u6848: vLLM PagedAttention<span class=\"token punctuation\">]<\/span><br \/>\n    D &#8212;<span class=\"token operator\">&gt;<\/span> D1<span class=\"token punctuation\">[<\/span>\u6838\u5fc3\u601d\u60f3: \u64cd\u4f5c\u7cfb\u7edf\u5206\u9875\u5185\u5b58\u7ba1\u7406<span class=\"token punctuation\">]<\/span><br \/>\n    D &#8212;<span class=\"token operator\">&gt;<\/span> D2<span class=\"token punctuation\">[<\/span>Block Table: \u6620\u5c04\u903b\u8f91\u9875\u5230\u7269\u7406\u9875<span class=\"token punctuation\">]<\/span><br \/>\n    D &#8212;<span class=\"token operator\">&gt;<\/span> D3<span class=\"token punctuation\">[<\/span>\u4f18\u52bf: \u663e\u5b58\u5229\u7528\u7387\u63a5\u8fd1100%<span class=\"token punctuation\">]<\/span><\/p>\n<p>\u3010KV Cache \u6838\u5fc3\u673a\u5236\u4e0e\u4f18\u5316\u903b\u8f91\u6811\u3011<\/p>\n<ul>\n<li>1. 
\u57fa\u7840\u539f\u7406 (Mechanism)\n<ul>\n<li>Prefill \u9636\u6bb5 (\u9884\u586b\u5145)&#xff1a;\u4e00\u6b21\u6027\u8ba1\u7b97 Prompt \u4e2d\u6240\u6709 Token \u7684 KV&#xff0c;\u5b58\u5165\u663e\u5b58\u3002<\/li>\n<li>Decode \u9636\u6bb5 (\u89e3\u7801)&#xff1a;\u6bcf\u6b65\u4ec5\u8ba1\u7b97 1 \u4e2a\u65b0 Token \u7684 KV&#xff0c;\u8ffd\u52a0\u5230\u7f13\u5b58\u672b\u5c3e\u3002<\/li>\n<li>\u672c\u8d28&#xff1a;\u901a\u8fc7\u6d88\u8017\u663e\u5b58&#xff08;Memory&#xff09;\u6765\u6d88\u9664\u5197\u4f59\u8ba1\u7b97&#xff08;Compute&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<li>2. \u9762\u4e34\u6311\u6218 (Bottleneck)\n<ul>\n<li>\u52a8\u6001\u589e\u957f&#xff1a;Cache \u5927\u5c0f\u968f\u5bf9\u8bdd\u957f\u5ea6\u4e0d\u53ef\u9884\u6d4b\u5730\u589e\u957f\u3002<\/li>\n<li>\u663e\u5b58\u788e\u7247&#xff1a;\u4f20\u7edf\u6846\u67b6\u8981\u6c42 KV Cache \u5360\u7528\u8fde\u7eed\u663e\u5b58&#xff0c;\u5bfc\u81f4\u5927\u91cf\u788e\u7247\u6d6a\u8d39\u3002<\/li>\n<\/ul>\n<\/li>\n<li>3. \u8fdb\u9636\u4f18\u5316 (Optimization: PagedAttention)\n<ul>\n<li>\u6838\u5fc3\u521b\u65b0&#xff1a;\u5f15\u5165\u64cd\u4f5c\u7cfb\u7edf\u7684\u201c\u865a\u62df\u5185\u5b58\u201d\u6982\u5ff5\u3002<\/li>\n<li>Block \u7ba1\u7406&#xff1a;\u5c06 KV Cache \u5207\u5206\u4e3a\u56fa\u5b9a\u5927\u5c0f\u7684\u5757&#xff08;Block&#xff09;&#xff0c;\u975e\u8fde\u7eed\u5b58\u50a8\u3002<\/li>\n<li>\u6548\u679c&#xff1a;\u663e\u5b58\u6d6a\u8d39\u7387\u4ece 20%-50% \u964d\u81f3 &lt;4%&#xff0c;\u541e\u5410\u91cf\u63d0\u5347 2-4 \u500d\u3002<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<h5>3.1.3 KV Cache \u663e\u5b58\u5360\u7528\u4f30\u7b97\u516c\u5f0f<\/h5>\n<p>\u5728\u5b9e\u9645\u90e8\u7f72\u4e2d&#xff0c;\u4e86\u89e3 KV Cache \u5360\u7528\u591a\u5c11\u663e\u5b58\u81f3\u5173\u91cd\u8981\u3002\u8fd9\u51b3\u5b9a\u4e86\u4f60\u7684\u663e\u5361\u80fd\u652f\u6301\u591a\u957f\u7684\u4e0a\u4e0b\u6587\u3002<\/p>\n<p>\u516c\u5f0f&#xff1a; <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>         
Memory\u00a0(Bytes)<\/p>\n<p>         &#061;<\/p>\n<p>         2<\/p>\n<p>         \u00d7<\/p>\n<p>         layers<\/p>\n<p>         \u00d7<\/p>\n<p>         hidden_size<\/p>\n<p>         \u00d7<\/p>\n<p>         seq_len<\/p>\n<p>         \u00d7<\/p>\n<p>         batch_size<\/p>\n<p>         \u00d7<\/p>\n<p>         dtype_size<\/p>\n<p>         \\\\text{Memory (Bytes)} &#061; 2 \\\\times \\\\text{layers} \\\\times \\\\text{hidden\\\\_size} \\\\times \\\\text{seq\\\\_len} \\\\times \\\\text{batch\\\\_size} \\\\times \\\\text{dtype\\\\_size} <\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord text\"><span class=\"mord\">Memory\u00a0(Bytes)<\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">2<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.8889em;vertical-align: -0.1944em\"><\/span><span class=\"mord text\"><span class=\"mord\">layers<\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.0044em;vertical-align: -0.31em\"><\/span><span class=\"mord text\"><span class=\"mord\">hidden_size<\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" 
style=\"height: 1.0044em;vertical-align: -0.31em\"><\/span><span class=\"mord text\"><span class=\"mord\">seq_len<\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.0044em;vertical-align: -0.31em\"><\/span><span class=\"mord text\"><span class=\"mord\">batch_size<\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.0044em;vertical-align: -0.31em\"><\/span><span class=\"mord text\"><span class=\"mord\">dtype_size<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<ul>\n<li>2&#xff1a;\u5206\u522b\u4ee3\u8868 Key \u548c Value \u4e24\u4e2a\u77e9\u9635\u3002<\/li>\n<li>dtype_size&#xff1a;FP16 \u4e3a 2 Bytes&#xff0c;FP32 \u4e3a 4 Bytes\u3002<\/li>\n<\/ul>\n<p>\u5b9e\u6218\u793a\u4f8b&#xff08;Llama-2-7B&#xff0c;FP16&#xff09;&#xff1a;<\/p>\n<ul>\n<li>\n<p>Layers &#061; 32, Hidden Size &#061; 4096<\/p>\n<\/li>\n<li>\n<p>\u5bf9\u4e8e 1 \u4e2a\u8bf7\u6c42&#xff0c;\u4e0a\u4e0b\u6587\u957f\u5ea6\u8fbe\u5230 1024 \u65f6&#xff1a; <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           2<\/p>\n<p>           \u00d7<\/p>\n<p>           32<\/p>\n<p>           \u00d7<\/p>\n<p>           4096<\/p>\n<p>           \u00d7<\/p>\n<p>           1024<\/p>\n<p>           \u00d7<\/p>\n<p>           1<\/p>\n<p>           \u00d7<\/p>\n<p>           2<\/p>\n<p>           \u2248<\/p>\n<p>           512<\/p>\n<p>           \u00a0MB<\/p>\n<p>           2 \\\\times 32 \\\\times 4096 \\\\times 1024 \\\\times 1 \\\\times 2 \\\\approx 512 \\\\text{ MB} <\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span 
class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">2<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">32<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">4096<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">1024<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">1<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6444em\"><\/span><span class=\"mord\">2<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">\u2248<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord\">512<\/span><span class=\"mord text\"><span 
class=\"mord\">\u00a0MB<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<\/li>\n<li>\n<p>\u7ed3\u8bba&#xff1a;\u4ec5 1K \u4e0a\u4e0b\u6587\u7684\u4e00\u4e2a\u5e76\u53d1\u5c31\u5360\u7528 0.5GB \u663e\u5b58\u3002\u5982\u679c\u662f 64 \u5e76\u53d1&#xff0c;\u4ec5 KV Cache \u5c31\u9700\u8981 32GB \u663e\u5b58&#xff01;\u8fd9\u5c31\u662f\u4e3a\u4ec0\u4e48**\u663e\u5b58\u4f18\u5316&#xff08;\u5982 PagedAttention&#xff09;**\u5982\u6b64\u91cd\u8981\u7684\u539f\u56e0\u3002<\/p>\n<\/li>\n<\/ul>\n<hr \/>\n<h4>3.2 \u9002\u7528\u573a\u666f\u4e0e\u9608\u503c\u6307\u5357<\/h4>\n<p>KV Cache \u662f\u73b0\u4ee3\u5927\u6a21\u578b\u63a8\u7406\u7684\u201c\u6807\u914d\u201d&#xff0c;\u4f46\u5728\u4ee5\u4e0b\u573a\u666f\u4e2d\u5176\u6536\u76ca\u6700\u4e3a\u5173\u952e\u3002<\/p>\n<h5>3.2.1 \u51b3\u7b56\u6811\u72b6\u56fe<\/h5>\n<p>\u662f\u5426\u5f00\u542f KV Cache?<br \/>\n\u251c\u2500\u2500 <span class=\"token number\">1<\/span>. \u4e0a\u4e0b\u6587\u957f\u5ea6\u5206\u6790<br \/>\n\u2502   \u251c\u2500\u2500 \u77ed\u6587\u672c <span class=\"token punctuation\">(<\/span><span class=\"token operator\">&lt;<\/span> <span class=\"token number\">128<\/span> tokens<span class=\"token punctuation\">)<\/span>: \u6536\u76ca\u4e0d\u660e\u663e&#xff0c;\u6b64\u65f6\u4e3b\u8981\u74f6\u9888\u5728\u6a21\u578b\u6743\u91cd\u52a0\u8f7d<br \/>\n\u2502   \u2514\u2500\u2500 \u957f\u6587\u672c <span class=\"token punctuation\">(<\/span><span class=\"token operator\">&gt;<\/span> <span class=\"token number\">512<\/span> tokens<span class=\"token punctuation\">)<\/span>: \u3010\u5fc5\u987b\u5f00\u542f\u3011&#xff0c;\u5426\u5219\u5ef6\u8fdf\u5c06\u5448\u6307\u6570\u7ea7\u4e0a\u5347<br \/>\n\u251c\u2500\u2500 <span class=\"token number\">2<\/span>. 
\u4ea4\u4e92\u6a21\u5f0f\u5206\u6790<br \/>\n\u2502   \u251c\u2500\u2500 \u5355\u8f6e\u95ee\u7b54: \u63a8\u8350\u5f00\u542f<br \/>\n\u2502   \u2514\u2500\u2500 \u591a\u8f6e\u5bf9\u8bdd: \u3010\u6838\u5fc3\u573a\u666f\u3011&#xff0c;\u5fc5\u987b\u7f13\u5b58\u5386\u53f2\u5bf9\u8bdd\u7684 KV&#xff0c;\u5426\u5219\u6bcf\u8f6e\u90fd\u8981\u91cd\u7b97\u65e7\u5386\u53f2<br \/>\n\u2514\u2500\u2500 <span class=\"token number\">3<\/span>. \u4e1a\u52a1\u5f62\u6001\u5206\u6790<br \/>\n    \u251c\u2500\u2500 \u79bb\u7ebf\u6279\u5904\u7406: \u914d\u5408 PagedAttention \u63d0\u5347 Batch Size<br \/>\n    \u2514\u2500\u2500 \u5b9e\u65f6\u6d41\u5f0f\u8f93\u51fa: \u5fc5\u987b\u5f00\u542f&#xff0c;\u4fdd\u8bc1 Token \u751f\u6210\u901f\u5ea6\u5e73\u7a33<\/p>\n<h5>3.2.2 \u8be6\u7ec6\u573a\u666f\u8bf4\u660e<\/h5>\n<li>\u591a\u8f6e\u5bf9\u8bdd&#xff08;Multi-Turn Conversation&#xff09;\n<ul>\n<li>\u7279\u5f81&#xff1a;\u7528\u6237\u4e0e AI \u53cd\u590d\u4ea4\u4e92&#xff0c;History \u8d8a\u6765\u8d8a\u957f\u3002<\/li>\n<li>\u4ef7\u503c&#xff1a;\u907f\u514d\u5728\u7b2c 10 \u8f6e\u5bf9\u8bdd\u65f6&#xff0c;\u8fd8\u8981\u91cd\u65b0\u8ba1\u7b97\u524d 9 \u8f6e\u7684\u6570\u767e\u4e2a Token\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u957f\u6587\u672c\u751f\u6210&#xff08;Long-Context Generation&#xff09;\n<ul>\n<li>\u7279\u5f81&#xff1a;\u5199\u5c0f\u8bf4\u3001\u751f\u6210\u957f\u7bc7\u62a5\u544a&#xff08;\u8f93\u51fa &gt; 1024 tokens&#xff09;\u3002<\/li>\n<li>\u4ef7\u503c&#xff1a;\u5982\u679c\u6ca1\u6709 KV Cache&#xff0c;\u751f\u6210\u7b2c 1000 \u4e2a\u5b57\u7684\u901f\u5ea6\u4f1a\u6bd4\u7b2c 1 \u4e2a\u5b57\u6162\u51e0\u767e\u500d&#xff08;\u56e0\u4e3a\u8ba1\u7b97\u91cf\u5728\u7d2f\u52a0&#xff09;\u3002\u5f00\u542f\u540e&#xff0c;\u751f\u6210\u901f\u5ea6\u4fdd\u6301\u6052\u5b9a\u3002<\/li>\n<\/ul>\n<\/li>\n<li>\u9ad8\u5e76\u53d1\u6279\u91cf\u63a8\u7406&#xff08;High Throughput Serving&#xff09;\n<ul>\n<li>\u7279\u5f81&#xff1a;API 
\u670d\u52a1\u7aef&#xff0c;\u540c\u65f6\u670d\u52a1\u6570\u5341\u4e2a\u7528\u6237\u3002<\/li>\n<li>\u4ef7\u503c&#xff1a;\u914d\u5408 vLLM \u7684 PagedAttention&#xff0c;\u53ef\u4ee5\u5728\u6709\u9650\u7684\u663e\u5b58\u4e2d\u585e\u5165\u66f4\u591a\u7528\u6237\u7684 KV Cache&#xff0c;\u4ece\u800c\u663e\u8457\u63d0\u5347 Batch Size&#xff08;\u541e\u5410\u91cf&#xff09;\u3002<\/li>\n<\/ul>\n<\/li>\n<h4>3.3 \u4ee3\u7801\u5b9e\u73b0&#xff08;Qwen2.5-7B-Instruct&#xff09;<\/h4>\n<h5>3.3.1 \u57fa\u7840KV Cache\u914d\u7f6e&#xff08;vLLM&#xff09;<\/h5>\n<h6>vLLM \u63a8\u7406\u52a0\u901f\u6d41\u7a0b\u56fe&#xff08;\u57fa\u4e8e\u4ee3\u7801\u903b\u8f91&#xff09;<\/h6>\n<p>\u2502<br \/>\n\u251c\u2500\u2500 \u3010\u7a0b\u5e8f\u5165\u53e3\u3011: <span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token builtin class-name\">:<\/span><br \/>\n\u2502<br \/>\n\u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span>. \u5f15\u64ce\u521d\u59cb\u5316 <span class=\"token punctuation\">(<\/span>Engine Initialization<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. 
\u73af\u5883\u914d\u7f6e <span class=\"token punctuation\">(<\/span>Environment<span class=\"token punctuation\">)<\/span>                                   \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d\u6587\u4ef6<span class=\"token operator\">&gt;<\/span>: .env <span class=\"token punctuation\">(<\/span>dotenv<span class=\"token punctuation\">)<\/span>                              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8bfb\u53d6\u53d8\u91cf<span class=\"token operator\">&gt;<\/span>: MODEL_DIR <span class=\"token punctuation\">(<\/span>\u6a21\u578b\u8def\u5f84<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. vLLM \u5f15\u64ce\u6784\u5efa <span class=\"token punctuation\">(<\/span>LLM Class Instantiation<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d\u6a21\u578b<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;Qwen\/Qwen2.5-7B-Instruct&#034;<\/span>                 \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u663e\u5b58\u4e0e\u7f13\u5b58\u7ba1\u7406<span class=\"token operator\">&gt;<\/span>: PagedAttention <span class=\"token punctuation\">(<\/span>\u6838\u5fc3\u4f18\u5316<span class=\"token punctuation\">)<\/span>             \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 gpu_memory_utilization <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0.9<\/span> <span class=\"token punctuation\">(<\/span>\u9884\u755990%\u663e\u5b58\u7528\u4e8eKV<span class=\"token punctuation\">)<\/span>    \u2502<br \/>\n\u2502   \u2502   
\u251c\u2500\u2500 swap_space <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">4<\/span> <span class=\"token punctuation\">(<\/span>4GB CPU\u5185\u5b58\u4f5c\u4e3aSwap\u533a&#xff0c;\u9632OOM<span class=\"token punctuation\">)<\/span>       \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 max_cpu_cache_buffer <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">2<\/span> <span class=\"token punctuation\">(<\/span>CPU\u7f13\u5b58\u7f13\u51b2<span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u5ea6\u4e0e\u5e76\u884c\u7b56\u7565<span class=\"token operator\">&gt;<\/span>: Batching <span class=\"token operator\">&amp;<\/span> Parallelism               \u2502<br \/>\n\u2502       \u251c\u2500\u2500 tensor_parallel_size <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">1<\/span> <span class=\"token punctuation\">(<\/span>\u5355\u5361\u8fd0\u884c<span class=\"token punctuation\">)<\/span>                 \u2502<br \/>\n\u2502       \u251c\u2500\u2500 max_num_seqs <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">64<\/span> <span class=\"token punctuation\">(<\/span>\u6700\u5927\u5e76\u53d1\u5e8f\u5217\u6570<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502       \u251c\u2500\u2500 max_num_batched_tokens <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">4096<\/span> <span class=\"token punctuation\">(<\/span>\u6279\u5904\u7406\u5927\u5c0f<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502       \u2514\u2500\u2500 enable_chunked_prefill <span class=\"token operator\">&#061;<\/span> True <span class=\"token punctuation\">(<\/span>\u957fPrompt\u5206\u5757\u5904\u7406<span class=\"token punctuation\">)<\/span>    
\u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u5f15\u64ce\u5c31\u7eea: llm \u5bf9\u8c61 <span class=\"token punctuation\">(<\/span>KV Cache \u7ba1\u7406\u5668\u5df2\u542f\u52a8<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">2<\/span>. \u63a8\u7406\u6267\u884c\u6d41\u7a0b <span class=\"token punctuation\">(<\/span>Inference Execution<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. 
\u91c7\u6837\u53c2\u6570\u8bbe\u5b9a <span class=\"token punctuation\">(<\/span>SamplingParams<span class=\"token punctuation\">)<\/span>                            \u2502<br \/>\n\u2502   \u251c\u2500\u2500 temperature <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0.1<\/span> <span class=\"token punctuation\">(<\/span>\u4f4e\u968f\u673a\u6027&#xff0c;\u9002\u5408\u6307\u4ee4\u9075\u5faa<span class=\"token punctuation\">)<\/span>               \u2502<br \/>\n\u2502   \u251c\u2500\u2500 max_tokens <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">2048<\/span> <span class=\"token punctuation\">(<\/span>\u9650\u5236\u8f93\u51fa\u957f\u5ea6<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502   \u2514\u2500\u2500 optimization: use_beam_search <span class=\"token operator\">&#061;<\/span> False <span class=\"token punctuation\">(<\/span>\u8d2a\u5a6a\u89e3\u7801\u66f4\u5feb<span class=\"token punctuation\">)<\/span>    \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. 
\u63d0\u793a\u8bcd\u5de5\u7a0b <span class=\"token punctuation\">(<\/span>Prompt Engineering<span class=\"token punctuation\">)<\/span>                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: prompts \u5217\u8868 <span class=\"token punctuation\">(<\/span>\u591a\u8f6e\u5bf9\u8bdd\u6d4b\u8bd5\u7528\u4f8b<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u683c\u5f0f\u5316<span class=\"token operator\">&gt;<\/span>: ChatML \u683c\u5f0f\u5c01\u88c5                              \u2502<br \/>\n\u2502       \u2514\u2500\u2500 f<span class=\"token string\">&#034;&lt;|im_start|&gt;user<span class=\"token entity\" title=\"\\\\n\">\\\\n<\/span>{prompt}&lt;|im_end|&gt;<span class=\"token entity\" title=\"\\\\n\">\\\\n<\/span>&#8230;&#034;<\/span>       \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 C. 
\u751f\u6210\u8c03\u7528\u5faa\u73af <span class=\"token punctuation\">(<\/span>llm.generate<span class=\"token punctuation\">)<\/span>                              \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u9636\u6bb5 <span class=\"token number\">1<\/span>: Prefill <span class=\"token punctuation\">(<\/span>\u9884\u586b\u5145 &#8211; \u8ba1\u7b97Prompt\u7684KV<span class=\"token punctuation\">)<\/span>               \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5206\u5757<span class=\"token operator\">&gt;<\/span>: \u82e5\u957f\u5ea6 <span class=\"token operator\">&gt;<\/span> <span class=\"token number\">512<\/span>, \u89e6\u53d1 Chunked Prefill          \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5b58\u50a8<span class=\"token operator\">&gt;<\/span>: \u5199\u5165\u663e\u5b58\u4e2d\u7684 Paged Block Tables             \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2514\u2500\u2500 \u9636\u6bb5 <span class=\"token number\">2<\/span>: Decode <span class=\"token punctuation\">(<\/span>\u89e3\u7801 &#8211; \u751f\u6210Response<span class=\"token punctuation\">)<\/span>                    \u2502<br \/>\n\u2502       \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8bfb\u53d6<span class=\"token operator\">&gt;<\/span>: \u6839\u636e Block Table \u7d22\u5f15\u547d\u4e2d\u5386\u53f2 KV             \u2502<br \/>\n\u2502       \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8ba1\u7b97<span class=\"token operator\">&gt;<\/span>: \u4ec5\u8ba1\u7b97\u65b0 Token \u7684 Attention <span class=\"token punctuation\">(<\/span>O<span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span>\u590d\u6742\u5ea6<span class=\"token 
punctuation\">)<\/span>    \u2502<br \/>\n\u2502       \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8fed\u4ee3<span class=\"token operator\">&gt;<\/span>: \u76f4\u5230\u8f93\u51fa <span class=\"token operator\">&lt;<\/span><span class=\"token operator\">|<\/span>im_end<span class=\"token operator\">|<\/span><span class=\"token operator\">&gt;<\/span>                         \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa\u5bf9\u8c61: RequestOutput <span class=\"token punctuation\">(<\/span>\u5305\u542b generated_text<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">3<\/span>. \u6027\u80fd\u5bf9\u6bd4\u6d4b\u8bd5 <span class=\"token punctuation\">(<\/span>Performance Benchmarking<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u573a\u666f <span class=\"token operator\"><span class=\"token file-descriptor important\">1<\/span>&gt;<\/span>: \u542f\u7528 KV Cache <span class=\"token punctuation\">(<\/span>test_kv_cache_effect<span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u72b6\u6001: \u6b63\u5e38\u5f00\u542f PagedAttention                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u884c\u4e3a: \u968f\u7740\u5bf9\u8bdd\u8f6e\u6570\u589e\u52a0&#xff0c;\u63a8\u7406\u901f\u5ea6\u4fdd\u6301\u7a33\u5b9a                 \u2502<br 
\/>\n\u2502   \u2514\u2500\u2500 \u8f93\u51fa: \u6253\u5370\u6bcf\u8f6e\u8017\u65f6 <span class=\"token punctuation\">(<\/span>Time Delta<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u573a\u666f <span class=\"token operator\"><span class=\"token file-descriptor important\">2<\/span>&gt;<\/span>: \u7981\u7528 KV Cache <span class=\"token punctuation\">(<\/span>test_no_kv_cache<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u914d\u7f6e: max_num_batched_tokens <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">1<\/span> <span class=\"token punctuation\">(<\/span>\u5f3a\u5236\u4e32\u884c\/\u65e0\u6279\u5904\u7406<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u884c\u4e3a: \u6bcf\u751f\u6210\u4e00\u4e2a\u8bcd\u90fd\u91cd\u7b97\u6240\u6709\u5386\u53f2&#xff0c;\u901f\u5ea6\u6781\u6162               \u2502<br \/>\n\u2502   \u2514\u2500\u2500 \u8f93\u51fa: \u6253\u5370\u9ad8\u5ef6\u8fdf\u7ed3\u679c&#xff0c;\u5f62\u6210\u9c9c\u660e\u5bf9\u6bd4                       \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u6700\u7ec8\u62a5\u544a: Console Print <span class=\"token punctuation\">(<\/span>Total Time <span class=\"token operator\">&amp;<\/span> Speed<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<p>import os<br \/>\nimport time<br \/>\nfrom vllm import LLM, SamplingParams<br \/>\nfrom dotenv import load_dotenv<\/p>\n<p># \u52a0\u8f7d\u73af\u5883\u53d8\u91cf&#xff08;\u53ef\u9009&#xff1a;\u914d\u7f6e\u6a21\u578b\u4e0b\u8f7d\u8def\u5f84&#xff09;<br 
\/>\nload_dotenv()<\/p>\n<p># 1. \u914d\u7f6evLLM&#xff08;\u542f\u7528KV Cache&#xff0c;\u9ed8\u8ba4\u5f00\u542fPagedAttention&#xff09;<br \/>\nllm &#061; LLM(<br \/>\n    model&#061;&#034;Qwen\/Qwen2.5-7B-Instruct&#034;,  # \u652f\u6301\u672c\u5730\u8def\u5f84\/ModelScope\/HuggingFace<br \/>\n    model_dir&#061;os.getenv(&#034;MODEL_DIR&#034;, &#034;.\/models\/Qwen2.5-7B-Instruct&#034;),<br \/>\n    tensor_parallel_size&#061;1,  # \u5355GPU<br \/>\n    gpu_memory_utilization&#061;0.9,  # \u663e\u5b58\u5229\u7528\u7387<br \/>\n    max_num_batched_tokens&#061;4096,  # \u6279\u91cftoken\u6570<br \/>\n    max_num_seqs&#061;64,  # \u6700\u5927\u5e76\u53d1\u5e8f\u5217\u6570<br \/>\n    # KV Cache\u6838\u5fc3\u914d\u7f6e<br \/>\n    swap_space&#061;4,  # \u78c1\u76d8\u4ea4\u6362\u7a7a\u95f4&#xff08;GB&#xff09;&#xff0c;\u663e\u5b58\u4e0d\u8db3\u65f6\u542f\u7528<br \/>\n    max_cpu_cache_buffer&#061;2,  # CPU\u7f13\u5b58KV\u7684\u7f13\u51b2\u533a&#xff08;GB&#xff09;<br \/>\n    enable_chunked_prefill&#061;True,  # \u5206\u5757\u9884\u586b\u5145KV Cache<br \/>\n    chunked_prefill_tokens&#061;512,  # \u9884\u586b\u5145chunk\u5927\u5c0f<br \/>\n)<\/p>\n<p># 2. \u91c7\u6837\u53c2\u6570\u914d\u7f6e<br \/>\nsampling_params &#061; SamplingParams(<br \/>\n    temperature&#061;0.1,<br \/>\n    max_tokens&#061;2048,  # \u6700\u5927\u751f\u6210\u957f\u5ea6<br \/>\n    top_p&#061;0.95,<br \/>\n    # KV Cache\u76f8\u5173<br \/>\n    use_beam_search&#061;False,<br \/>\n    ignore_eos&#061;False,<br \/>\n)<\/p>\n<p># 3. 
\u6d4b\u8bd5KV Cache\u6548\u679c<br \/>\ndef test_kv_cache_effect():<br \/>\n    # \u6d4b\u8bd5\u7528\u4f8b&#xff1a;\u957f\u4e0a\u4e0b\u6587\u591a\u8f6e\u5bf9\u8bdd<br \/>\n    prompts &#061; [<br \/>\n        &#034;\u8bf7\u8be6\u7ec6\u4ecb\u7ecd\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6280\u672fKV Cache\u7684\u5de5\u4f5c\u539f\u7406&#xff0c;\u8981\u6c42\u5206\u70b9\u8bf4\u660e&#xff0c;\u5b57\u6570\u4e0d\u5c11\u4e8e1000\u5b57&#034;,<br \/>\n        &#034;\u57fa\u4e8e\u4f60\u521a\u624d\u7684\u56de\u7b54&#xff0c;\u8fdb\u4e00\u6b65\u89e3\u91caPagedAttention\u76f8\u6bd4\u4f20\u7edfKV Cache\u7684\u4f18\u52bf&#034;,<br \/>\n        &#034;\u603b\u7ed3\u4e0a\u8ff0\u5185\u5bb9&#xff0c;\u7ed9\u51faKV Cache\u5728\u5b9e\u9645\u90e8\u7f72\u4e2d\u7684\u6700\u4f73\u5b9e\u8df5&#034;<br \/>\n    ]<\/p>\n<p>    # \u8bb0\u5f55\u6bcf\u8f6e\u8017\u65f6<br \/>\n    total_time &#061; 0<br \/>\n    for i, prompt in enumerate(prompts):<br \/>\n        start_time &#061; time.time()<br \/>\n        # \u6784\u9020\u5bf9\u8bdd\u683c\u5f0f&#xff08;Qwen2.5\u8981\u6c42&#xff09;<br \/>\n        formatted_prompt &#061; f&#034;&lt;|im_start|&gt;user\\\\n{prompt}&lt;|im_end|&gt;\\\\n&lt;|im_start|&gt;assistant\\\\n&#034;<br \/>\n        outputs &#061; llm.generate([formatted_prompt], sampling_params)<br \/>\n        end_time &#061; time.time()<\/p>\n<p>        # \u8f93\u51fa\u7ed3\u679c<br \/>\n        output &#061; outputs[0].outputs[0].text<br \/>\n        elapsed_time &#061; end_time - start_time<br \/>\n        total_time &#043;&#061; elapsed_time<\/p>\n<p>        print(f&#034;\\\\n&#061;&#061;&#061; \u7b2c{i&#043;1}\u8f6e\u5bf9\u8bdd &#061;&#061;&#061;&#034;)<br \/>\n        print(f&#034;\u8017\u65f6&#xff1a;{elapsed_time:.2f}\u79d2&#034;)<br \/>\n        print(f&#034;\u751f\u6210\u957f\u5ea6&#xff1a;{len(output)}\u5b57\u7b26&#034;)<br \/>\n        print(f&#034;\u5185\u5bb9&#xff1a;{output[:200]}&#8230;&#034;)<\/p>\n<p>    
print(f&#034;\\\\n\u603b\u8017\u65f6&#xff1a;{total_time:.2f}\u79d2&#xff0c;\u5e73\u5747\u6bcf\u8f6e&#xff1a;{total_time\/len(prompts):.2f}\u79d2&#034;)<\/p>\n<p># 4. \u5bf9\u6bd4\u65e0KV Cache&#xff08;\u7981\u7528PagedAttention&#xff09;<br \/>\ndef test_no_kv_cache():<br \/>\n    # \u7981\u7528KV Cache&#xff08;\u901a\u8fc7\u8bbe\u7f6emax_num_batched_tokens&#061;1&#xff09;<br \/>\n    llm_no_kv &#061; LLM(<br \/>\n        model&#061;&#034;Qwen\/Qwen2.5-7B-Instruct&#034;,<br \/>\n        model_dir&#061;os.getenv(&#034;MODEL_DIR&#034;, &#034;.\/models\/Qwen2.5-7B-Instruct&#034;),<br \/>\n        tensor_parallel_size&#061;1,<br \/>\n        gpu_memory_utilization&#061;0.9,<br \/>\n        max_num_batched_tokens&#061;1,  # \u7981\u7528\u6279\u91cf\u5904\u7406&#xff0c;\u7b49\u4ef7\u4e8e\u7981\u7528KV Cache<br \/>\n        enable_chunked_prefill&#061;False,<br \/>\n    )<\/p>\n<p>    prompt &#061; &#034;\u8bf7\u8be6\u7ec6\u4ecb\u7ecd\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6280\u672fKV Cache\u7684\u5de5\u4f5c\u539f\u7406&#xff0c;\u8981\u6c42\u5206\u70b9\u8bf4\u660e&#xff0c;\u5b57\u6570\u4e0d\u5c11\u4e8e1000\u5b57&#034;<br \/>\n    formatted_prompt &#061; f&#034;&lt;|im_start|&gt;user\\\\n{prompt}&lt;|im_end|&gt;\\\\n&lt;|im_start|&gt;assistant\\\\n&#034;<\/p>\n<p>    start_time &#061; time.time()<br \/>\n    outputs &#061; llm_no_kv.generate([formatted_prompt], sampling_params)<br \/>\n    end_time &#061; time.time()<\/p>\n<p>    output &#061; outputs[0].outputs[0].text<br \/>\n    elapsed_time &#061; end_time - start_time<\/p>\n<p>    print(f&#034;\\\\n&#061;&#061;&#061; \u65e0KV Cache\u6d4b\u8bd5 &#061;&#061;&#061;&#034;)<br \/>\n    print(f&#034;\u8017\u65f6&#xff1a;{elapsed_time:.2f}\u79d2&#034;)<br \/>\n    print(f&#034;\u751f\u6210\u957f\u5ea6&#xff1a;{len(output)}\u5b57\u7b26&#034;)<\/p>\n<p>if __name__ &#061;&#061; &#034;__main__&#034;:<br \/>\n    print(&#034;&#061;&#061;&#061; \u6d4b\u8bd5KV Cache\u6548\u679c &#061;&#061;&#061;&#034;)<br \/>\n    
test_kv_cache_effect()<br \/>\n    print(&#034;\\\\n&#061;&#061;&#061; \u6d4b\u8bd5\u65e0KV Cache\u6548\u679c &#061;&#061;&#061;&#034;)<br \/>\n    test_no_kv_cache()<\/p>\n<h5>3.3.2 \u6548\u679c\u9a8c\u8bc1\u8f93\u51fa\u793a\u4f8b<\/h5>\n<p>&#061;&#061;&#061; \u6d4b\u8bd5KV Cache\u6548\u679c &#061;&#061;&#061;<br \/>\n&#061;&#061;&#061; \u7b2c1\u8f6e\u5bf9\u8bdd &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;8.24\u79d2<br \/>\n\u751f\u6210\u957f\u5ea6&#xff1a;1256\u5b57\u7b26<br \/>\n\u5185\u5bb9&#xff1a;KV Cache&#xff08;Key-Value Cache&#xff09;\u662f\u5927\u6a21\u578b\u63a8\u7406\u9636\u6bb5\u7684\u6838\u5fc3\u52a0\u901f\u6280\u672f&#xff0c;\u4e3b\u8981\u9488\u5bf9Transformer\u67b6\u6784\u7684\u81ea\u6ce8\u610f\u529b\u673a\u5236\u4f18\u5316&#8230;<\/p>\n<p>&#061;&#061;&#061; \u7b2c2\u8f6e\u5bf9\u8bdd &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;3.12\u79d2<br \/>\n\u751f\u6210\u957f\u5ea6&#xff1a;898\u5b57\u7b26<br \/>\n\u5185\u5bb9&#xff1a;PagedAttention\u662fvLLM\u63d0\u51fa\u7684KV Cache\u4f18\u5316\u65b9\u6848&#xff0c;\u89e3\u51b3\u4e86\u4f20\u7edfKV Cache\u7684\u4e24\u5927\u95ee\u9898&#xff1a;1. \u663e\u5b58\u788e\u7247&#8230;<\/p>\n<p>&#061;&#061;&#061; \u7b2c3\u8f6e\u5bf9\u8bdd &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;2.87\u79d2<br \/>\n\u751f\u6210\u957f\u5ea6&#xff1a;756\u5b57\u7b26<br \/>\n\u5185\u5bb9&#xff1a;KV Cache\u5728\u5b9e\u9645\u90e8\u7f72\u4e2d\u7684\u6700\u4f73\u5b9e\u8df5\u5305\u62ec&#xff1a;1. 
\u6839\u636eGPU\u663e\u5b58\u8c03\u6574chunked_prefill_tokens&#8230;<\/p>\n<p>\u603b\u8017\u65f6&#xff1a;14.23\u79d2&#xff0c;\u5e73\u5747\u6bcf\u8f6e&#xff1a;4.74\u79d2<\/p>\n<p>&#061;&#061;&#061; \u6d4b\u8bd5\u65e0KV Cache\u6548\u679c &#061;&#061;&#061;<br \/>\n&#061;&#061;&#061; \u65e0KV Cache\u6d4b\u8bd5 &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;22.58\u79d2<br \/>\n\u751f\u6210\u957f\u5ea6&#xff1a;1248\u5b57\u7b26<\/p>\n<h4>3.4 KV Cache \u8c03\u4f18\u6307\u5357<\/h4>\n<table>\n<tr>\u53c2\u6570\u4f5c\u7528\u63a8\u8350\u503c&#xff08;7B\u6a21\u578b&#xff09;\u8c03\u4f18\u539f\u5219<\/tr>\n<tbody>\n<tr>\n<td>max_num_seqs<\/td>\n<td>\u6700\u5927\u5e76\u53d1\u5e8f\u5217\u6570<\/td>\n<td>32-64<\/td>\n<td>\u663e\u5b58\u5145\u8db3\u5219\u589e\u5927<\/td>\n<\/tr>\n<tr>\n<td>chunked_prefill_tokens<\/td>\n<td>\u9884\u586b\u5145chunk\u5927\u5c0f<\/td>\n<td>512-1024<\/td>\n<td>\u957f\u6587\u672c\u2192\u589e\u5927&#xff0c;\u77ed\u6587\u672c\u2192\u51cf\u5c0f<\/td>\n<\/tr>\n<tr>\n<td>gpu_memory_utilization<\/td>\n<td>\u663e\u5b58\u5229\u7528\u7387<\/td>\n<td>0.8-0.9<\/td>\n<td>\u907f\u514dOOM\u5219\u964d\u4f4e\u81f30.7<\/td>\n<\/tr>\n<tr>\n<td>swap_space<\/td>\n<td>\u78c1\u76d8\u4ea4\u6362\u7a7a\u95f4<\/td>\n<td>2-8GB<\/td>\n<td>\u663e\u5b58\u4e0d\u8db3\u65f6\u542f\u7528<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>\u56db\u3001\u91cf\u5316\u6280\u672f\u5b9e\u6218&#xff08;INT4\/INT8\/GPTQ&#xff09;<\/h3>\n<h4>4.1 
\u91cf\u5316\u6280\u672f\u5206\u7c7b\u4e0e\u9009\u578b<\/h4>\n<table>\n<tr>\u91cf\u5316\u7c7b\u578b\u5de5\u5177\u7cbe\u5ea6\u901f\u5ea6\u63d0\u5347\u663e\u5b58\u8282\u7701\u9002\u7528\u6a21\u578b<\/tr>\n<tbody>\n<tr>\n<td>\u52a8\u6001INT8<\/td>\n<td>bitsandbytes<\/td>\n<td>INT8<\/td>\n<td>1.5\u500d<\/td>\n<td>50%<\/td>\n<td>\u6240\u6709\u5f00\u6e90\u6a21\u578b<\/td>\n<\/tr>\n<tr>\n<td>\u9759\u6001GPTQ<\/td>\n<td>AutoGPTQ<\/td>\n<td>INT4\/INT8<\/td>\n<td>2-3\u500d<\/td>\n<td>60-70%<\/td>\n<td>\u652f\u6301GPTQ\u91cf\u5316\u7684\u6a21\u578b<\/td>\n<\/tr>\n<tr>\n<td>AWQ\u91cf\u5316<\/td>\n<td>awq<\/td>\n<td>INT4\/INT8<\/td>\n<td>2.5-3\u500d<\/td>\n<td>65-75%<\/td>\n<td>Llama3\/Qwen2.5<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>4.2 \u52a8\u6001INT8\u91cf\u5316&#xff08;Transformers &#043; bitsandbytes&#xff09;<\/h4>\n<h5>4.2.1 \u4ee3\u7801\u5b9e\u73b0<\/h5>\n<h6>\u4ee3\u7801\u5b9e\u73b0\u6d41\u7a0b\u56fe (INT8 Quantization Flow)<\/h6>\n<p>\u2502<br \/>\n\u251c\u2500\u2500 \u3010\u7a0b\u5e8f\u5165\u53e3\u3011: <span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token builtin class-name\">:<\/span><br \/>\n\u2502<br \/>\n\u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span>. \u73af\u5883\u4e0e\u914d\u7f6e\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Configuration <span class=\"token operator\">&amp;<\/span> Setup<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. 
\u73af\u5883\u521d\u59cb\u5316                                               \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d\u6587\u4ef6<span class=\"token operator\">&gt;<\/span>: .env <span class=\"token punctuation\">(<\/span>\u8bfb\u53d6 MODEL_DIR<span class=\"token punctuation\">)<\/span>                       \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u68c0\u6d4b\u786c\u4ef6<span class=\"token operator\">&gt;<\/span>: torch.cuda.is_available<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>                   \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. \u91cf\u5316\u914d\u7f6e\u6784\u5efa <span class=\"token punctuation\">(<\/span>BitsAndBytesConfig<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&lt;<\/span>\u2605 \u6838\u5fc3\u6b65\u9aa4<span class=\"token operator\">&gt;<\/span>            \u2502<br \/>\n\u2502   \u251c\u2500\u2500 load_in_8bit <span class=\"token operator\">&#061;<\/span> True <span class=\"token punctuation\">(<\/span>\u6fc0\u6d3b INT8 \u6df7\u5408\u7cbe\u5ea6<span class=\"token punctuation\">)<\/span>                \u2502<br \/>\n\u2502   \u251c\u2500\u2500 llm_int8_threshold <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">6.0<\/span> <span class=\"token punctuation\">(<\/span>\u79bb\u7fa4\u503c\u4fdd\u62a4<span class=\"token punctuation\">)<\/span>                   \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u4f5c\u7528: \u8d85\u8fc7 <span class=\"token number\">6.0<\/span> \u7684\u6fc0\u6d3b\u503c\u4fdd\u7559 FP16&#xff0c;\u9632\u6b62\u7cbe\u5ea6\u5d29\u584c      \u2502<br \/>\n\u2502   \u2514\u2500\u2500 llm_int8_skip_modules <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span><span 
class=\"token string\">&#034;lm_head&#034;<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token punctuation\">(<\/span>\u8f93\u51fa\u5c42\u4fdd\u6301\u9ad8\u7cbe\u5ea6<span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u914d\u7f6e\u5bf9\u8c61: bnb_config \u51c6\u5907\u5c31\u7eea \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">2<\/span>. \u6a21\u578b\u52a0\u8f7d\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Quantized Model Loading<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. Tokenizer \u52a0\u8f7d                                          \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528<span class=\"token operator\">&gt;<\/span>: AutoTokenizer.from_pretrained                  \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. 
\u6a21\u578b\u52a8\u6001\u91cf\u5316\u52a0\u8f7d <span class=\"token punctuation\">(<\/span>On-the-fly Quantization<span class=\"token punctuation\">)<\/span>               \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: \u539f\u59cb FP16 \u6743\u91cd\u6587\u4ef6 <span class=\"token punctuation\">(<\/span>.bin\/.safetensors<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5904\u7406<span class=\"token operator\">&gt;<\/span>: AutoModelForCausalLM.from_pretrained           \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 \u6ce8\u5165 <span class=\"token assign-left variable\">quantization_config<\/span><span class=\"token operator\">&#061;<\/span>bnb_config                \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 \u6620\u5c04 <span class=\"token assign-left variable\">device_map<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;auto&#034;<\/span> <span class=\"token punctuation\">(<\/span>\u81ea\u52a8\u5206\u914d GPU\/CPU<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u7ed3\u679c: \u663e\u5b58\u4e2d\u9a7b\u7559\u7684\u662f <span class=\"token punctuation\">[<\/span>INT8 \u6743\u91cd<span class=\"token punctuation\">]<\/span> &#043; <span class=\"token punctuation\">[<\/span>\u5c11\u91cf FP16 \u6a21\u5757<span class=\"token punctuation\">]<\/span>    \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u72b6\u6001\u68c0\u67e5: model.is_quantized <span class=\"token punctuation\">(<\/span>\u5e94\u4e3a True<span class=\"token punctuation\">)<\/span> 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">3<\/span>. \u63a8\u7406\u6267\u884c\u5faa\u73af <span class=\"token punctuation\">(<\/span>Inference Loop: test_int8_inference<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: Prompts \u5217\u8868 <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;\u89e3\u91ca\u91cf\u5316&#8230;&#034;<\/span>, <span class=\"token string\">&#034;\u5982\u4f55\u5e73\u8861&#8230;&#034;<\/span><span class=\"token punctuation\">]<\/span>           \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 \u21bb \u904d\u5386\u6bcf\u4e2a Prompt                                           \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 A. 
\u9884\u5904\u7406 <span class=\"token punctuation\">(<\/span>Preprocessing<span class=\"token punctuation\">)<\/span>                              \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6a21\u7248<span class=\"token operator\">&gt;<\/span>: apply_chat_template <span class=\"token punctuation\">(<\/span>User\/Assistant \u683c\u5f0f<span class=\"token punctuation\">)<\/span>  \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u5f20\u91cf: input_ids <span class=\"token punctuation\">(<\/span>\u79fb\u52a8\u5230 CUDA \u8bbe\u5907<span class=\"token punctuation\">)<\/span>                \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 B. \u751f\u6210\u8fc7\u7a0b <span class=\"token punctuation\">(<\/span>Generation<span class=\"token punctuation\">)<\/span>                               \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528<span class=\"token operator\">&gt;<\/span>: model.generate                             \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 \u53c2\u6570\u63a7\u5236:                                          \u2502<br \/>\n\u2502   \u2502   \u2502   \u251c\u2500\u2500 <span class=\"token assign-left variable\">max_new_tokens<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">512<\/span> <span class=\"token punctuation\">(<\/span>\u751f\u6210\u957f\u5ea6<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502   \u2502   \u2502   \u251c\u2500\u2500 <span class=\"token assign-left variable\">use_cache<\/span><span class=\"token operator\">&#061;<\/span>True <span class=\"token punctuation\">(<\/span>\u5f00\u542f KV Cache \u52a0\u901f<span class=\"token punctuation\">)<\/span>            \u2502<br \/>\n\u2502   \u2502   \u2502   \u2514\u2500\u2500 
<span class=\"token assign-left variable\">do_sample<\/span><span class=\"token operator\">&#061;<\/span>True <span class=\"token punctuation\">(<\/span>\u542f\u7528\u968f\u673a\u91c7\u6837<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8ba1\u7b97: <span class=\"token punctuation\">[<\/span>INT8 \u77e9\u9635\u4e58\u6cd5<span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&lt;<\/span>&#8211;<span class=\"token operator\">&gt;<\/span> <span class=\"token punctuation\">[<\/span>FP16 \u6fc0\u6d3b\u503c<span class=\"token punctuation\">]<\/span> <span class=\"token punctuation\">(<\/span>\u6df7\u5408\u8ba1\u7b97<span class=\"token punctuation\">)<\/span> \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2514\u2500\u2500 C. \u540e\u5904\u7406 <span class=\"token punctuation\">(<\/span>Post-processing<span class=\"token punctuation\">)<\/span>                            \u2502<br \/>\n\u2502       \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u89e3\u7801<span class=\"token operator\">&gt;<\/span>: tokenizer.decode <span class=\"token punctuation\">(<\/span>Token ID &#8211;<span class=\"token operator\">&gt;<\/span> \u6587\u672c<span class=\"token punctuation\">)<\/span>        \u2502<br \/>\n\u2502       \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u622a\u65ad<span class=\"token operator\">&gt;<\/span>: remove <span class=\"token string\">&#034;&lt;|im_end|&gt;&#034;<\/span>                        \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa: \u6253\u5370\u751f\u6210\u5185\u5bb9\u3001\u8017\u65f6\u7edf\u8ba1 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">4<\/span>. \u6027\u80fd\u76d1\u63a7\u4e0e\u5bf9\u6bd4 <span class=\"token punctuation\">(<\/span>Metrics <span class=\"token operator\">&amp;<\/span> Benchmarking<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. \u901f\u5ea6\u6307\u6807 <span class=\"token punctuation\">(<\/span>Speed Metrics<span class=\"token punctuation\">)<\/span>                                 \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u8ba1\u7b97: Generate Tokens \/ Elapsed Time                   \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u7ed3\u679c: xx.xx tokens\/\u79d2 <span class=\"token punctuation\">(<\/span>\u901a\u5e38\u6bd4 FP16 \u7565\u6162&#xff0c;\u4f46\u7701\u663e\u5b58<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. 
\u663e\u5b58\u6307\u6807 <span class=\"token punctuation\">(<\/span>Memory Metrics<span class=\"token punctuation\">)<\/span>                                \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u76d1\u63a7: torch.cuda.max_memory_allocated<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>                \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u7ed3\u679c: INT8 \u6a21\u5f0f\u4e0b\u663e\u5b58\u5360\u7528\u7ea6\u4e3a FP16 \u7684 <span class=\"token number\">50<\/span>%~60%         \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 C. \u5bf9\u7167\u7ec4 <span class=\"token punctuation\">(<\/span>test_fp16_inference<span class=\"token punctuation\">)<\/span>                             \u2502<br \/>\n    \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8fd0\u884c\u76f8\u540c\u903b\u8f91&#xff0c;\u4f46\u4e0d\u52a0\u8f7d quantization_config \u4ee5\u4f5c\u5bf9\u6bd4     \u2502<br \/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<p>import torch<br \/>\nimport time<br \/>\nfrom transformers import (<br \/>\n    AutoModelForCausalLM,<br \/>\n    AutoTokenizer,<br \/>\n    BitsAndBytesConfig<br \/>\n)<br \/>\nfrom dotenv import load_dotenv<br \/>\nimport os<\/p>\n<p>load_dotenv()<\/p>\n<p># 1. 
\u914d\u7f6eINT8\u91cf\u5316<br \/>\nbnb_config &#061; BitsAndBytesConfig(<br \/>\n    load_in_8bit&#061;True,  # \u542f\u7528INT8\u91cf\u5316<br \/>\n    bnb_4bit_use_double_quant&#061;False,  # bnb_4bit_*\u53c2\u6570\u4ec5\u5728load_in_4bit\u65f6\u751f\u6548<br \/>\n    bnb_4bit_quant_type&#061;&#034;nf4&#034;,<br \/>\n    bnb_4bit_compute_dtype&#061;torch.float16,<br \/>\n    # CPU\u5378\u8f7d\u914d\u7f6e&#xff08;\u663e\u5b58\u4e0d\u8db3\u65f6\u53ef\u8bbe\u4e3aTrue&#xff09;<br \/>\n    llm_int8_enable_fp32_cpu_offload&#061;False,<br \/>\n    llm_int8_skip_modules&#061;[&#034;lm_head&#034;],  # \u8f93\u51fa\u5c42\u4e0d\u91cf\u5316<br \/>\n    llm_int8_threshold&#061;6.0,<br \/>\n)<\/p>\n<p># 2. \u52a0\u8f7d\u6a21\u578b\u548cTokenizer<br \/>\nmodel_name &#061; &#034;Qwen\/Qwen2.5-7B-Instruct&#034;<br \/>\nmodel_dir &#061; os.getenv(&#034;MODEL_DIR&#034;, &#034;.\/models\/Qwen2.5-7B-Instruct&#034;)<\/p>\n<p>tokenizer &#061; AutoTokenizer.from_pretrained(<br \/>\n    model_dir if os.path.exists(model_dir) else model_name,<br \/>\n    trust_remote_code&#061;True,<br \/>\n    use_fast&#061;False<br \/>\n)<\/p>\n<p># \u52a0\u8f7d\u91cf\u5316\u6a21\u578b<br \/>\nstart_load &#061; time.time()<br \/>\nmodel &#061; AutoModelForCausalLM.from_pretrained(<br \/>\n    model_dir if os.path.exists(model_dir) else model_name,<br \/>\n    quantization_config&#061;bnb_config,<br \/>\n    device_map&#061;&#034;auto&#034;,  # \u81ea\u52a8\u5206\u914d\u8bbe\u5907<br \/>\n    trust_remote_code&#061;True,<br \/>\n    torch_dtype&#061;torch.float16,<br \/>\n    low_cpu_mem_usage&#061;True<br \/>\n)<br \/>\nend_load &#061; time.time()<\/p>\n<p>print(f&#034;\u6a21\u578b\u52a0\u8f7d\u8017\u65f6&#xff1a;{end_load - start_load:.2f}\u79d2&#034;)<br \/>\nprint(f&#034;\u6a21\u578b\u8bbe\u5907&#xff1a;{model.device}&#034;)<br \/>\nprint(f&#034;\u662f\u5426\u91cf\u5316&#xff1a;{model.is_quantized}&#034;)<\/p>\n<p># 3. 
\u6d4b\u8bd5\u63a8\u7406\u901f\u5ea6\u548c\u663e\u5b58<br \/>\ndef test_int8_inference():<br \/>\n    # \u6d4b\u8bd5\u7528\u4f8b<br \/>\n    prompts &#061; [<br \/>\n        &#034;\u89e3\u91ca\u5927\u6a21\u578b\u91cf\u5316\u6280\u672f\u7684\u539f\u7406\u548c\u4f18\u7f3a\u70b9&#034;,<br \/>\n        &#034;\u5982\u4f55\u5e73\u8861\u91cf\u5316\u540e\u7684\u7cbe\u5ea6\u635f\u5931\u548c\u901f\u5ea6\u63d0\u5347&#034;,<br \/>\n        &#034;INT8\u91cf\u5316\u76f8\u6bd4INT4\u91cf\u5316\u7684\u4f18\u52bf&#034;<br \/>\n    ]<\/p>\n<p>    # \u5f00\u542fKV Cache&#xff08;\u9ed8\u8ba4\u5f00\u542f&#xff09;<br \/>\n    model.config.use_cache &#061; True<\/p>\n<p>    total_time &#061; 0<br \/>\n    total_tokens &#061; 0<\/p>\n<p>    for prompt in prompts:<br \/>\n        # \u6784\u9020\u8f93\u5165<br \/>\n        messages &#061; [<br \/>\n            {&#034;role&#034;: &#034;user&#034;, &#034;content&#034;: prompt}<br \/>\n        ]<br \/>\n        input_ids &#061; tokenizer.apply_chat_template(<br \/>\n            messages,<br \/>\n            tokenize&#061;True,<br \/>\n            add_generation_prompt&#061;True,<br \/>\n            return_tensors&#061;&#034;pt&#034;<br \/>\n        ).to(model.device)<\/p>\n<p>        # \u63a8\u7406<br \/>\n        start_time &#061; time.time()<br \/>\n        outputs &#061; model.generate(<br \/>\n            input_ids&#061;input_ids,<br \/>\n            max_new_tokens&#061;512,<br \/>\n            temperature&#061;0.1,<br \/>\n            top_p&#061;0.95,<br \/>\n            do_sample&#061;True,<br \/>\n            use_cache&#061;True,  # \u663e\u5f0f\u542f\u7528KV Cache<br \/>\n        )<br \/>\n        end_time &#061; time.time()<\/p>\n<p>        # \u89e3\u6790\u8f93\u51fa<br \/>\n        output_text &#061; tokenizer.decode(outputs[0], skip_special_tokens&#061;True)<br \/>\n        output_text &#061; output_text.split(&#034;&lt;|im_end|&gt;&#034;)[-1].strip()  # skip_special_tokens\u5df2\u79fb\u9664\u7279\u6b8a\u7b26&#xff0c;\u53d6[-1]\u907f\u514dIndexError<\/p>\n<p>        # \u7edf\u8ba1<br \/>\n        elapsed_time &#061; end_time - 
start_time<br \/>\n        num_tokens &#061; len(outputs[0]) - len(input_ids[0])<br \/>\n        total_time &#043;&#061; elapsed_time<br \/>\n        total_tokens &#043;&#061; num_tokens<\/p>\n<p>        print(f&#034;\\\\n&#061;&#061;&#061; \u8f93\u5165&#xff1a;{prompt[:50]}&#8230; &#061;&#061;&#061;&#034;)<br \/>\n        print(f&#034;\u8017\u65f6&#xff1a;{elapsed_time:.2f}\u79d2&#034;)<br \/>\n        print(f&#034;\u751f\u6210Token\u6570&#xff1a;{num_tokens}&#034;)<br \/>\n        print(f&#034;\u901f\u5ea6&#xff1a;{num_tokens\/elapsed_time:.2f} tokens\/\u79d2&#034;)<br \/>\n        print(f&#034;\u8f93\u51fa&#xff1a;{output_text[:200]}&#8230;&#034;)<\/p>\n<p>    # \u6c47\u603b<br \/>\n    avg_speed &#061; total_tokens \/ total_time<br \/>\n    print(f&#034;\\\\n\u5e73\u5747\u901f\u5ea6&#xff1a;{avg_speed:.2f} tokens\/\u79d2&#034;)<\/p>\n<p>    # \u663e\u5b58\u4f7f\u7528<br \/>\n    if torch.cuda.is_available():<br \/>\n        mem_used &#061; torch.cuda.max_memory_allocated() \/ 1024 \/ 1024 \/ 1024<br \/>\n        print(f&#034;\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;{mem_used:.2f} GB&#034;)<\/p>\n<p># 4. 
\u5bf9\u6bd4\u975e\u91cf\u5316\u7248\u672c<br \/>\ndef test_fp16_inference():<br \/>\n    # \u52a0\u8f7dFP16\u6a21\u578b&#xff08;\u975e\u91cf\u5316&#xff09;<br \/>\n    model_fp16 &#061; AutoModelForCausalLM.from_pretrained(<br \/>\n        model_dir if os.path.exists(model_dir) else model_name,<br \/>\n        device_map&#061;&#034;auto&#034;,<br \/>\n        trust_remote_code&#061;True,<br \/>\n        torch_dtype&#061;torch.float16,<br \/>\n        low_cpu_mem_usage&#061;True<br \/>\n    )<br \/>\n    model_fp16.config.use_cache &#061; True<\/p>\n<p>    prompt &#061; &#034;\u89e3\u91ca\u5927\u6a21\u578b\u91cf\u5316\u6280\u672f\u7684\u539f\u7406\u548c\u4f18\u7f3a\u70b9&#034;<br \/>\n    messages &#061; [{&#034;role&#034;: &#034;user&#034;, &#034;content&#034;: prompt}]<br \/>\n    input_ids &#061; tokenizer.apply_chat_template(<br \/>\n        messages,<br \/>\n        tokenize&#061;True,<br \/>\n        add_generation_prompt&#061;True,<br \/>\n        return_tensors&#061;&#034;pt&#034;<br \/>\n    ).to(model_fp16.device)<\/p>\n<p>    start_time &#061; time.time()<br \/>\n    outputs &#061; model_fp16.generate(<br \/>\n        input_ids&#061;input_ids,<br \/>\n        max_new_tokens&#061;512,<br \/>\n        temperature&#061;0.1,<br \/>\n        top_p&#061;0.95,<br \/>\n        do_sample&#061;True,<br \/>\n    )<br \/>\n    end_time &#061; time.time()<\/p>\n<p>    elapsed_time &#061; end_time - start_time<br \/>\n    num_tokens &#061; len(outputs[0]) - len(input_ids[0])<br \/>\n    speed &#061; num_tokens \/ elapsed_time<\/p>\n<p>    print(f&#034;\\\\n&#061;&#061;&#061; FP16\u975e\u91cf\u5316\u6d4b\u8bd5 &#061;&#061;&#061;&#034;)<br \/>\n    print(f&#034;\u8017\u65f6&#xff1a;{elapsed_time:.2f}\u79d2&#034;)<br \/>\n    print(f&#034;\u901f\u5ea6&#xff1a;{speed:.2f} tokens\/\u79d2&#034;)<\/p>\n<p>    if torch.cuda.is_available():<br \/>\n        mem_used &#061; torch.cuda.max_memory_allocated() \/ 1024 \/ 1024 \/ 1024<br \/>\n        
print(f&#034;\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;{mem_used:.2f} GB&#034;)<\/p>\n<p>if __name__ &#061;&#061; &#034;__main__&#034;:<br \/>\n    print(&#034;&#061;&#061;&#061; INT8\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5 &#061;&#061;&#061;&#034;)<br \/>\n    test_int8_inference()<br \/>\n    print(&#034;\\\\n&#061;&#061;&#061; FP16\u975e\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5 &#061;&#061;&#061;&#034;)<br \/>\n    test_fp16_inference()<\/p>\n<h5>4.2.2 \u6548\u679c\u9a8c\u8bc1\u8f93\u51fa\u793a\u4f8b<\/h5>\n<p>\u6a21\u578b\u52a0\u8f7d\u8017\u65f6&#xff1a;45.68\u79d2<br \/>\n\u6a21\u578b\u8bbe\u5907&#xff1a;cuda:0<br \/>\n\u662f\u5426\u91cf\u5316&#xff1a;True<\/p>\n<p>&#061;&#061;&#061; INT8\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5 &#061;&#061;&#061;<br \/>\n&#061;&#061;&#061; \u8f93\u5165&#xff1a;\u89e3\u91ca\u5927\u6a21\u578b\u91cf\u5316\u6280\u672f\u7684\u539f\u7406\u548c\u4f18\u7f3a\u70b9&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;4.28\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;325<br \/>\n\u901f\u5ea6&#xff1a;75.93 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;\u5927\u6a21\u578b\u91cf\u5316\u6280\u672f\u662f\u5c06\u6a21\u578b\u6743\u91cd\u4ece\u9ad8\u7cbe\u5ea6&#xff08;\u5982FP16\/FP32&#xff09;\u8f6c\u6362\u4e3a\u4f4e\u7cbe\u5ea6&#xff08;\u5982INT8\/INT4&#xff09;\u7684\u4f18\u5316\u624b\u6bb5&#xff0c;\u6838\u5fc3\u539f\u7406\u5305\u62ec&#xff1a;1. \u6743\u91cd\u91cf\u5316&#xff1a;\u5c06\u6d6e\u70b9\u6570\u6743\u91cd\u6620\u5c04\u5230\u6574\u6570\u533a\u95f4&#8230;<\/p>\n<p>&#061;&#061;&#061; \u8f93\u5165&#xff1a;\u5982\u4f55\u5e73\u8861\u91cf\u5316\u540e\u7684\u7cbe\u5ea6\u635f\u5931\u548c\u901f\u5ea6\u63d0\u5347&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;3.87\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;289<br \/>\n\u901f\u5ea6&#xff1a;74.68 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;\u5e73\u8861\u91cf\u5316\u7cbe\u5ea6\u635f\u5931\u548c\u901f\u5ea6\u63d0\u5347\u9700\u4ece\u4ee5\u4e0b\u7ef4\u5ea6\u5165\u624b&#xff1a;1. 
\u91cf\u5316\u7c7b\u578b\u9009\u62e9&#xff1a;\u52a8\u6001\u91cf\u5316&#xff08;\u5982INT8&#xff09;\u7cbe\u5ea6\u635f\u5931\u5c0f&#xff0c;\u9759\u6001\u91cf\u5316&#xff08;\u5982GPTQ&#xff09;\u901f\u5ea6\u63d0\u5347\u5927&#8230;<\/p>\n<p>&#061;&#061;&#061; \u8f93\u5165&#xff1a;INT8\u91cf\u5316\u76f8\u6bd4INT4\u91cf\u5316\u7684\u4f18\u52bf&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;3.56\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;256<br \/>\n\u901f\u5ea6&#xff1a;71.91 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;INT8\u91cf\u5316\u76f8\u6bd4INT4\u91cf\u5316\u7684\u6838\u5fc3\u4f18\u52bf\u5728\u4e8e\u7cbe\u5ea6\u635f\u5931\u66f4\u5c0f&#xff0c;\u5177\u4f53\u4f53\u73b0\u5728&#xff1a;1. \u6570\u503c\u8303\u56f4\u66f4\u5927&#xff1a;INT8\u53ef\u8868\u793a-128~127&#xff0c;INT4\u4ec5-8~7&#8230;<\/p>\n<p>\u5e73\u5747\u901f\u5ea6&#xff1a;74.18 tokens\/\u79d2<br \/>\n\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;8.24 GB<\/p>\n<p>&#061;&#061;&#061; FP16\u975e\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5 &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;7.89\u79d2<br \/>\n\u901f\u5ea6&#xff1a;41.19 tokens\/\u79d2<br \/>\n\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;13.87 GB<\/p>\n<h4>4.3 GPTQ\u9759\u6001\u91cf\u5316&#xff08;AutoGPTQ&#xff09;<\/h4>\n<h5>4.3.1 \u4ee3\u7801\u5b9e\u73b0&#xff08;INT4\u91cf\u5316&#xff09;<\/h5>\n<h6>\u4ee3\u7801\u5b9e\u73b0\u6d41\u7a0b\u56fe (GPTQ INT4 Quantization Flow)<\/h6>\n<p>\u2502<br \/>\n\u251c\u2500\u2500 \u3010\u7a0b\u5e8f\u5165\u53e3\u3011: <span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token builtin class-name\">:<\/span><br \/>\n\u2502<br \/>\n\u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span>. 
\u914d\u7f6e\u4e0e\u51c6\u5907\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Configuration <span class=\"token operator\">&amp;<\/span> Setup<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. \u73af\u5883\u521d\u59cb\u5316                                               \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d\u6587\u4ef6<span class=\"token operator\">&gt;<\/span>: .env <span class=\"token punctuation\">(<\/span>\u8bfb\u53d6 MODEL_DIR<span class=\"token punctuation\">)<\/span>                       \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. \u91cf\u5316\u53c2\u6570\u8bbe\u5b9a <span class=\"token punctuation\">(<\/span>BaseQuantizeConfig<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 bits <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">4<\/span> <span class=\"token punctuation\">(<\/span>INT4 \u7cbe\u5ea6&#xff0c;\u6838\u5fc3\u538b\u7f29\u53c2\u6570<span class=\"token punctuation\">)<\/span>                       \u2502<br \/>\n\u2502   \u251c\u2500\u2500 group_size <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">128<\/span> <span class=\"token punctuation\">(<\/span>\u5206\u7ec4\u91cf\u5316&#xff0c;\u5e73\u8861\u7cbe\u5ea6\u4e0e\u663e\u5b58<span class=\"token punctuation\">)<\/span>             \u2502<br \/>\n\u2502   \u2514\u2500\u2500 desc_act <span class=\"token operator\">&#061;<\/span> False <span class=\"token punctuation\">(<\/span>\u7981\u7528\u6fc0\u6d3b\u91cd\u6392&#xff0c;\u63d0\u5347\u63a8\u7406\u901f\u5ea6<span 
class=\"token punctuation\">)<\/span>           \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8def\u5f84\u68c0\u67e5: quantized_model_dir \u662f\u5426\u5df2\u5b58\u5728? \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">2<\/span>. \u6a21\u578b\u83b7\u53d6\u7b56\u7565 <span class=\"token punctuation\">(<\/span>Model Acquisition Strategy<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&lt;<\/span>\u2605 \u6838\u5fc3\u5206\u652f<span class=\"token operator\">&gt;<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5206\u652f A<span class=\"token operator\">&gt;<\/span>: \u9996\u6b21\u8fd0\u884c <span class=\"token punctuation\">(<\/span>\u6267\u884c\u91cf\u5316 &#8211; Quantization Phase<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6761\u4ef6<span class=\"token operator\">&gt;<\/span>: \u76ee\u5f55\u4e0d\u5b58\u5728                                     \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">1<\/span>. \u52a0\u8f7d\u539f\u6a21: AutoGPTQForCausalLM.from_pretrained <span class=\"token punctuation\">(<\/span>FP16<span class=\"token punctuation\">)<\/span>\u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">2<\/span>. 
\u51c6\u5907\u6570\u636e: examples <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;&#8230;&#034;<\/span>, <span class=\"token string\">&#034;&#8230;&#034;<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token punctuation\">(<\/span>\u6821\u51c6\u6570\u636e\u96c6<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">3<\/span>. \u6267\u884c\u91cf\u5316: model.quantize <span class=\"token punctuation\">(<\/span>\u8017\u65f6\u64cd\u4f5c<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8ba1\u7b97<span class=\"token operator\">&gt;<\/span>: \u57fa\u4e8e Hessian \u77e9\u9635\u4f18\u5316\u6743\u91cd&#xff0c;\u6700\u5c0f\u5316\u8bef\u5dee       \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token number\">4<\/span>. \u5e8f\u5217\u5316\u4fdd\u5b58: model.save_quantized <span class=\"token punctuation\">(<\/span>\u751f\u6210 INT4 \u6743\u91cd<span class=\"token punctuation\">)<\/span>    \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5206\u652f B<span class=\"token operator\">&gt;<\/span>: \u540e\u7eed\u8fd0\u884c <span class=\"token punctuation\">(<\/span>\u76f4\u63a5\u52a0\u8f7d &#8211; Loading Phase<span class=\"token punctuation\">)<\/span>               \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6761\u4ef6<span class=\"token operator\">&gt;<\/span>: \u76ee\u5f55\u5df2\u5b58\u5728                                     \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">1<\/span>. 
\u5feb\u901f\u52a0\u8f7d: AutoGPTQForCausalLM.from_quantized        \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 use_safetensors <span class=\"token operator\">&#061;<\/span> True <span class=\"token punctuation\">(<\/span>\u5b89\u5168\u5feb\u901f\u52a0\u8f7d<span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 device_map <span class=\"token operator\">&#061;<\/span> <span class=\"token string\">&#034;auto&#034;<\/span> <span class=\"token punctuation\">(<\/span>\u81ea\u52a8\u6620\u5c04\u663e\u5b58<span class=\"token punctuation\">)<\/span>                 \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token number\">2<\/span>. \u6ce8\u5165\u6838\u5fc3: GPTQ Kernel <span class=\"token punctuation\">(<\/span>ExLlama\/AutoGPTQ CUDA\u6838\u5fc3<span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u6a21\u578b\u5c31\u7eea: model <span class=\"token punctuation\">(<\/span>INT4 \u6743\u91cd &#043; FP16 \u6fc0\u6d3b\u8ba1\u7b97<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">3<\/span>. \u63a8\u7406\u6267\u884c\u6d41\u7a0b <span class=\"token punctuation\">(<\/span>Inference Execution<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. 
\u7ba1\u9053\u6784\u5efa <span class=\"token punctuation\">(<\/span>Pipeline Setup<span class=\"token punctuation\">)<\/span>                                \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u914d\u7f6e<span class=\"token operator\">&gt;<\/span>: model.config.use_cache <span class=\"token operator\">&#061;<\/span> True <span class=\"token punctuation\">(<\/span>\u5f00\u542f KV Cache<span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5c01\u88c5<span class=\"token operator\">&gt;<\/span>: TextGenerationPipeline <span class=\"token punctuation\">(<\/span>HuggingFace \u7ba1\u9053<span class=\"token punctuation\">)<\/span>       \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. \u751f\u6210\u5faa\u73af <span class=\"token punctuation\">(<\/span>test_gptq_inference<span class=\"token punctuation\">)<\/span>                           \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: Prompts <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;GPTQ\u4f18\u52bf&#8230;&#034;<\/span>, <span class=\"token string\">&#034;\u6ce8\u610f\u4e8b\u9879&#8230;&#034;<\/span><span class=\"token punctuation\">]<\/span>          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u9884\u5904\u7406<span class=\"token operator\">&gt;<\/span>: tokenizer.apply_chat_template                 \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6838\u5fc3\u751f\u6210<span class=\"token operator\">&gt;<\/span>: pipeline<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">..<\/span>.<span class=\"token 
punctuation\">)<\/span>                               \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 max_new_tokens <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">512<\/span>                               \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 \u7b97\u5b50\u52a0\u901f: INT4 \u77e9\u9635\u4e58\u6cd5 <span class=\"token punctuation\">(<\/span>W4A16 GEMM<span class=\"token punctuation\">)<\/span>                \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 \u663e\u5b58\u4f18\u52bf: \u6743\u91cd\u4f53\u79ef\u7ea6\u4e3a FP16 \u7684 <span class=\"token number\">1<\/span>\/4                  \u2502<br \/>\n\u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u540e\u5904\u7406<span class=\"token operator\">&gt;<\/span>: split<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;&lt;|im_end|&gt;&#034;<\/span><span class=\"token punctuation\">)<\/span> &#8211;<span class=\"token operator\">&gt;<\/span> \u83b7\u53d6\u7eaf\u51c0\u6587\u672c           \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa\u7ed3\u679c: \u6253\u5370 Text <span class=\"token operator\">&amp;<\/span> Speed <span class=\"token punctuation\">(<\/span>tokens\/s<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">4<\/span>. 
\u6027\u80fd\u76d1\u63a7 <span class=\"token punctuation\">(<\/span>Metrics <span class=\"token operator\">&amp;<\/span> Analysis<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. \u901f\u5ea6\u7edf\u8ba1                                                \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u8ba1\u7b97: Total Tokens \/ Total Time                        \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u9884\u671f: <span class=\"token number\">100<\/span>&#043; tokens\/s <span class=\"token punctuation\">(<\/span>\u901a\u5e38\u5feb\u4e8e bitsandbytes \u52a8\u6001\u91cf\u5316<span class=\"token punctuation\">)<\/span>  \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. 
\u663e\u5b58\u76d1\u63a7                                                \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528<span class=\"token operator\">&gt;<\/span>: torch.cuda.max_memory_allocated<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u9884\u671f: 7B \u6a21\u578b\u4ec5\u9700 ~5GB \u663e\u5b58 <span class=\"token punctuation\">(<\/span>\u5927\u5e45\u4f4e\u4e8e FP16 \u7684 14GB<span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<p>import torch<br \/>\nimport time<br \/>\nfrom auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig<br \/>\nfrom transformers import AutoTokenizer, TextGenerationPipeline<br \/>\nimport os<br \/>\nfrom dotenv import load_dotenv<\/p>\n<p>load_dotenv()<\/p>\n<p># 1. \u914d\u7f6eGPTQ\u91cf\u5316\u53c2\u6570<br \/>\nquantize_config &#061; BaseQuantizeConfig(<br \/>\n    bits&#061;4,  # INT4\u91cf\u5316<br \/>\n    group_size&#061;128,  # \u91cf\u5316\u5206\u7ec4\u5927\u5c0f<br \/>\n    desc_act&#061;False,  # \u7981\u7528\u6fc0\u6d3b\u91cf\u5316&#xff08;\u51cf\u5c11\u7cbe\u5ea6\u635f\u5931&#xff09;<br \/>\n    model_name_or_path&#061;&#034;Qwen\/Qwen2.5-7B-Instruct&#034;,<br \/>\n    model_dir&#061;os.getenv(&#034;MODEL_DIR&#034;, &#034;.\/models\/Qwen2.5-7B-Instruct&#034;),<br \/>\n    use_triton&#061;False,  # \u7981\u7528Triton&#xff08;\u517c\u5bb9\u6027\u66f4\u597d&#xff09;<br \/>\n    bitsandbytes_backend&#061;False,<br \/>\n)<\/p>\n<p># 2. 
\u52a0\u8f7dTokenizer<br \/>\ntokenizer &#061; AutoTokenizer.from_pretrained(<br \/>\n    quantize_config.model_dir if os.path.exists(quantize_config.model_dir) else quantize_config.model_name_or_path,<br \/>\n    trust_remote_code&#061;True,<br \/>\n    use_fast&#061;False<br \/>\n)<\/p>\n<p># 3. \u91cf\u5316\u6a21\u578b&#xff08;\u9996\u6b21\u8fd0\u884c\u9700\u91cf\u5316&#xff0c;\u540e\u7eed\u53ef\u52a0\u8f7d\u91cf\u5316\u540e\u7684\u6a21\u578b&#xff09;<br \/>\nquantized_model_dir &#061; &#034;.\/models\/Qwen2.5-7B-Instruct-GPTQ-4bit&#034;<\/p>\n<p>if not os.path.exists(quantized_model_dir):<br \/>\n    # \u52a0\u8f7d\u539f\u59cb\u6a21\u578b<br \/>\n    model &#061; AutoGPTQForCausalLM.from_pretrained(<br \/>\n        quantize_config.model_dir if os.path.exists(quantize_config.model_dir) else quantize_config.model_name_or_path,<br \/>\n        quantize_config&#061;quantize_config,<br \/>\n        device_map&#061;&#034;auto&#034;,<br \/>\n        trust_remote_code&#061;True,<br \/>\n        torch_dtype&#061;torch.float16,<br \/>\n    )<\/p>\n<p>    # \u51c6\u5907\u91cf\u5316\u6570\u636e\u96c6&#xff08;\u4f7f\u7528\u6837\u4f8b\u6570\u636e&#xff09;<br \/>\n    examples &#061; [<br \/>\n        tokenizer(&#034;\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u7684\u6838\u5fc3\u6280\u672f\u6709\u54ea\u4e9b&#xff1f;&#034;),<br \/>\n        tokenizer(&#034;KV Cache\u7684\u5de5\u4f5c\u539f\u7406\u662f\u4ec0\u4e48&#xff1f;&#034;),<br \/>\n        tokenizer(&#034;\u5982\u4f55\u4f18\u5316\u91cf\u5316\u540e\u7684\u6a21\u578b\u7cbe\u5ea6&#xff1f;&#034;),<br \/>\n        tokenizer(&#034;\u6a21\u578b\u84b8\u998f\u548c\u91cf\u5316\u7684\u533a\u522b\u662f\u4ec0\u4e48&#xff1f;&#034;),<br \/>\n        tokenizer(&#034;\u5927\u6a21\u578b\u90e8\u7f72\u7684\u663e\u5b58\u4f18\u5316\u7b56\u7565\u6709\u54ea\u4e9b&#xff1f;&#034;)<br \/>\n    ]<\/p>\n<p>    # \u6267\u884c\u91cf\u5316<br \/>\n    start_quant &#061; time.time()<br \/>\n    model.quantize(<br \/>\n        examples,<br \/>\n        
batch_size&#061;1,<br \/>\n        use_triton&#061;False<br \/>\n    )<br \/>\n    end_quant &#061; time.time()<br \/>\n    print(f&#034;\u91cf\u5316\u8017\u65f6&#xff1a;{end_quant &#8211; start_quant:.2f}\u79d2&#034;)<\/p>\n<p>    # \u4fdd\u5b58\u91cf\u5316\u540e\u7684\u6a21\u578b<br \/>\n    model.save_quantized(quantized_model_dir)<br \/>\n    tokenizer.save_pretrained(quantized_model_dir)<br \/>\nelse:<br \/>\n    # \u52a0\u8f7d\u5df2\u91cf\u5316\u7684\u6a21\u578b<br \/>\n    start_load &#061; time.time()<br \/>\n    model &#061; AutoGPTQForCausalLM.from_quantized(<br \/>\n        quantized_model_dir,<br \/>\n        device_map&#061;&#034;auto&#034;,<br \/>\n        trust_remote_code&#061;True,<br \/>\n        use_safetensors&#061;True,<br \/>\n        torch_dtype&#061;torch.float16,<br \/>\n    )<br \/>\n    end_load &#061; time.time()<br \/>\n    print(f&#034;\u52a0\u8f7d\u91cf\u5316\u6a21\u578b\u8017\u65f6&#xff1a;{end_load &#8211; start_load:.2f}\u79d2&#034;)<\/p>\n<p># 4. \u542f\u7528KV Cache<br \/>\nmodel.config.use_cache &#061; True<\/p>\n<p># 5. \u521b\u5efa\u63a8\u7406\u7ba1\u9053<br \/>\npipeline &#061; TextGenerationPipeline(<br \/>\n    model&#061;model,<br \/>\n    tokenizer&#061;tokenizer,<br \/>\n    device&#061;model.device,<br \/>\n)<\/p>\n<p># 6. 
\u6d4b\u8bd5GPTQ\u91cf\u5316\u63a8\u7406<br \/>\ndef test_gptq_inference():<br \/>\n    prompts &#061; [<br \/>\n        &#034;\u8be6\u7ec6\u8bf4\u660eGPTQ\u91cf\u5316\u76f8\u6bd4\u52a8\u6001INT8\u91cf\u5316\u7684\u4f18\u52bf&#034;,<br \/>\n        &#034;INT4 GPTQ\u91cf\u5316\u5728\u5b9e\u9645\u90e8\u7f72\u4e2d\u7684\u6ce8\u610f\u4e8b\u9879&#034;,<br \/>\n        &#034;\u5982\u4f55\u8bc4\u4f30\u91cf\u5316\u6a21\u578b\u7684\u7cbe\u5ea6\u635f\u5931&#034;<br \/>\n    ]<\/p>\n<p>    total_time &#061; 0<br \/>\n    total_tokens &#061; 0<\/p>\n<p>    for prompt in prompts:<br \/>\n        # \u6784\u9020\u8f93\u5165<br \/>\n        messages &#061; [{&#034;role&#034;: &#034;user&#034;, &#034;content&#034;: prompt}]<br \/>\n        input_text &#061; tokenizer.apply_chat_template(<br \/>\n            messages,<br \/>\n            add_generation_prompt&#061;True,<br \/>\n            tokenize&#061;False<br \/>\n        )<\/p>\n<p>        # \u63a8\u7406<br \/>\n        start_time &#061; time.time()<br \/>\n        outputs &#061; pipeline(<br \/>\n            input_text,<br \/>\n            max_new_tokens&#061;512,<br \/>\n            temperature&#061;0.1,<br \/>\n            top_p&#061;0.95,<br \/>\n            do_sample&#061;True,<br \/>\n            use_cache&#061;True,<br \/>\n        )<br \/>\n        end_time &#061; time.time()<\/p>\n<p>        # \u89e3\u6790\u8f93\u51fa<br \/>\n        output_text &#061; outputs[0][&#034;generated_text&#034;].split(&#034;&lt;|im_end|&gt;&#034;)[-2].strip()<br \/>\n        input_tokens &#061; len(tokenizer.encode(input_text))<br \/>\n        output_tokens &#061; len(tokenizer.encode(output_text))<br \/>\n        total_tokens &#043;&#061; output_tokens<br \/>\n        elapsed_time &#061; end_time &#8211; start_time<br \/>\n        total_time &#043;&#061; elapsed_time<\/p>\n<p>        print(f&#034;\\\\n&#061;&#061;&#061; \u8f93\u5165&#xff1a;{prompt[:50]}&#8230; &#061;&#061;&#061;&#034;)<br \/>\n        
print(f&#034;\u8017\u65f6&#xff1a;{elapsed_time:.2f}\u79d2&#034;)<br \/>\n        print(f&#034;\u751f\u6210Token\u6570&#xff1a;{output_tokens}&#034;)<br \/>\n        print(f&#034;\u901f\u5ea6&#xff1a;{output_tokens\/elapsed_time:.2f} tokens\/\u79d2&#034;)<br \/>\n        print(f&#034;\u8f93\u51fa&#xff1a;{output_text[:200]}&#8230;&#034;)<\/p>\n<p>    # \u6c47\u603b<br \/>\n    avg_speed &#061; total_tokens \/ total_time<br \/>\n    print(f&#034;\\\\nGPTQ INT4\u5e73\u5747\u901f\u5ea6&#xff1a;{avg_speed:.2f} tokens\/\u79d2&#034;)<\/p>\n<p>    # \u663e\u5b58\u4f7f\u7528<br \/>\n    if torch.cuda.is_available():<br \/>\n        mem_used &#061; torch.cuda.max_memory_allocated() \/ 1024 \/ 1024 \/ 1024<br \/>\n        print(f&#034;\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;{mem_used:.2f} GB&#034;)<\/p>\n<p>if __name__ &#061;&#061; &#034;__main__&#034;:<br \/>\n    print(&#034;&#061;&#061;&#061; GPTQ INT4\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5 &#061;&#061;&#061;&#034;)<br \/>\n    test_gptq_inference()<\/p>\n<h5>4.3.2 \u6548\u679c\u9a8c\u8bc1\u8f93\u51fa\u793a\u4f8b<\/h5>\n<p>\u52a0\u8f7d\u91cf\u5316\u6a21\u578b\u8017\u65f6&#xff1a;18.76\u79d2<\/p>\n<p>&#061;&#061;&#061; GPTQ INT4\u91cf\u5316\u63a8\u7406\u6d4b\u8bd5 &#061;&#061;&#061;<br \/>\n&#061;&#061;&#061; \u8f93\u5165&#xff1a;\u8be6\u7ec6\u8bf4\u660eGPTQ\u91cf\u5316\u76f8\u6bd4\u52a8\u6001INT8\u91cf\u5316\u7684\u4f18\u52bf&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;2.98\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;345<br \/>\n\u901f\u5ea6&#xff1a;115.77 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;GPTQ&#xff08;GPT Quantization&#xff09;\u662f\u4e00\u79cd\u9759\u6001\u91cf\u5316\u65b9\u6cd5&#xff0c;\u76f8\u6bd4\u52a8\u6001INT8\u91cf\u5316\u6709\u4ee5\u4e0b\u6838\u5fc3\u4f18\u52bf&#xff1a;1. 
\u901f\u5ea6\u66f4\u5feb&#xff1a;GPTQ\u5728\u91cf\u5316\u65f6\u9884\u8ba1\u7b97\u6743\u91cd\u7f29\u653e\u56e0\u5b50&#xff0c;\u63a8\u7406\u65f6\u65e0\u9700\u52a8\u6001\u8ba1\u7b97&#8230;<\/p>\n<p>&#061;&#061;&#061; \u8f93\u5165&#xff1a;INT4 GPTQ\u91cf\u5316\u5728\u5b9e\u9645\u90e8\u7f72\u4e2d\u7684\u6ce8\u610f\u4e8b\u9879&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;2.76\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;312<br \/>\n\u901f\u5ea6&#xff1a;113.04 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;INT4 GPTQ\u91cf\u5316\u90e8\u7f72\u9700\u6ce8\u610f\u4ee5\u4e0b\u4e8b\u9879&#xff1a;1. \u7cbe\u5ea6\u76d1\u63a7&#xff1a;INT4\u91cf\u5316\u7cbe\u5ea6\u635f\u5931\u7565\u9ad8\u4e8eINT8&#xff0c;\u9700\u9488\u5bf9\u6838\u5fc3\u573a\u666f\u505a\u7cbe\u5ea6\u9a8c\u8bc1&#8230;<\/p>\n<p>&#061;&#061;&#061; \u8f93\u5165&#xff1a;\u5982\u4f55\u8bc4\u4f30\u91cf\u5316\u6a21\u578b\u7684\u7cbe\u5ea6\u635f\u5931&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;2.54\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;289<br \/>\n\u901f\u5ea6&#xff1a;113.78 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;\u8bc4\u4f30\u91cf\u5316\u6a21\u578b\u7cbe\u5ea6\u635f\u5931\u7684\u65b9\u6cd5\u5305\u62ec&#xff1a;1. 
\u57fa\u51c6\u6d4b\u8bd5&#xff1a;\u5728\u6807\u51c6\u6570\u636e\u96c6&#xff08;\u5982MMLU\u3001C-Eval&#xff09;\u4e0a\u5bf9\u6bd4\u539f\u59cb\u6a21\u578b\u548c\u91cf\u5316\u6a21\u578b\u7684\u51c6\u786e\u7387&#8230;<\/p>\n<p>GPTQ INT4\u5e73\u5747\u901f\u5ea6&#xff1a;114.19 tokens\/\u79d2<br \/>\n\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;4.87 GB<\/p>\n<h3>\u4e94\u3001\u6a21\u578b\u84b8\u998f\u5b9e\u6218&#xff08;\u4ece7B\u84b8\u998f\u52301.8B&#xff09;<\/h3>\n<h4>5.1 \u6a21\u578b\u84b8\u998f\u6838\u5fc3\u539f\u7406<\/h4>\n<p>\u6a21\u578b\u84b8\u998f\u662f\u5c06\u5927\u6a21\u578b&#xff08;\u6559\u5e08\u6a21\u578b&#xff0c;\u5982Qwen2.5-7B&#xff09;\u7684\u77e5\u8bc6\u8fc1\u79fb\u5230\u5c0f\u6a21\u578b&#xff08;\u5b66\u751f\u6a21\u578b&#xff0c;\u5982Qwen2.5-1.8B&#xff09;\u7684\u8fc7\u7a0b&#xff0c;\u6838\u5fc3\u5305\u62ec&#xff1a;<\/p>\n<ul>\n<li>\n<p>\u77e5\u8bc6\u84b8\u998f&#xff1a;\u5339\u914d\u6559\u5e08\u6a21\u578b\u548c\u5b66\u751f\u6a21\u578b\u7684\u8f93\u51fa\u5206\u5e03&#xff08;Soft Target&#xff09;<\/p>\n<\/li>\n<li>\n<p>\u4e2d\u95f4\u5c42\u84b8\u998f&#xff1a;\u5339\u914d\u6559\u5e08\u6a21\u578b\u548c\u5b66\u751f\u6a21\u578b\u7684\u9690\u85cf\u5c42\u7279\u5f81<\/p>\n<\/li>\n<li>\n<p>\u5bf9\u8bdd\u84b8\u998f&#xff1a;\u9488\u5bf9\u5bf9\u8bdd\u573a\u666f\u7684\u4e13\u7528\u84b8\u998f\u7b56\u7565&#xff08;\u4fdd\u7559\u591a\u8f6e\u4ea4\u4e92\u80fd\u529b&#xff09;<\/p>\n<\/li>\n<\/ul>\n<h4>5.2 \u84b8\u998f\u73af\u5883\u914d\u7f6e<\/h4>\n<p># \u989d\u5916\u5b89\u88c5\u84b8\u998f\u4f9d\u8d56<br \/>\npip install trl&#061;&#061;0.8.6 peft&#061;&#061;0.11.1 accelerate&#061;&#061;0.31.0<br \/>\npip install datasets&#061;&#061;2.20.0 evaluate&#061;&#061;0.4.2<\/p>\n<h4>5.3 \u4ee3\u7801\u5b9e\u73b0&#xff08;Qwen2.5-7B \u2192 Qwen2.5-1.8B&#xff09;<\/h4>\n<h5>5.3.1 \u5b8c\u6574\u84b8\u998f\u4ee3\u7801<\/h5>\n<h6>\u5b8c\u6574\u84b8\u998f\u6d41\u7a0b\u56fe (Knowledge Distillation Flow)<\/h6>\n<p>\u2502<br \/>\n\u251c\u2500\u2500 \u3010\u7a0b\u5e8f\u5165\u53e3\u3011: <span class=\"token keyword\">if<\/span> 
__name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token builtin class-name\">:<\/span><br \/>\n\u2502<br \/>\n\u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span>. \u57fa\u7840\u67b6\u6784\u6784\u5efa <span class=\"token punctuation\">(<\/span>Infrastructure Setup<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 A. \u5168\u5c40\u914d\u7f6e <span class=\"token punctuation\">(<\/span>Config <span class=\"token operator\">&amp;<\/span> Environment<span class=\"token punctuation\">)<\/span>                             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d\u914d\u7f6e<span class=\"token operator\">&gt;<\/span>: DistillationConfig                             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u53c2\u6570\u8bbe\u5b9a<span class=\"token operator\">&gt;<\/span>: <span class=\"token assign-left variable\">Temperature<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">2.0<\/span> <span class=\"token punctuation\">(<\/span>\u8f6f\u5316\u6982\u7387\u5206\u5e03<span class=\"token punctuation\">)<\/span>, <span class=\"token assign-left variable\">Alpha<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.3<\/span>      \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u786c\u4ef6\u68c0\u6d4b<span class=\"token operator\">&gt;<\/span>: <span class=\"token assign-left variable\">device<\/span><span class=\"token 
operator\">&#061;<\/span><span class=\"token string\">&#034;cuda&#034;<\/span> <span class=\"token punctuation\">(<\/span>FP16 \u6df7\u5408\u7cbe\u5ea6\u5f00\u542f<span class=\"token punctuation\">)<\/span>               \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 B. \u53cc\u6a21\u578b\u52a0\u8f7d <span class=\"token punctuation\">(<\/span>Dual-Model Loading<span class=\"token punctuation\">)<\/span>                             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 &#x1f393; \u6559\u5e08\u6a21\u578b <span class=\"token punctuation\">(<\/span>Teacher Model<span class=\"token punctuation\">)<\/span>                                \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;Qwen2.5-7B-Instruct&#034;<\/span>                          \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u72b6\u6001<span class=\"token operator\">&gt;<\/span>: model.eval<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span> &#043; <span class=\"token assign-left variable\">requires_grad<\/span><span class=\"token operator\">&#061;<\/span>False <span class=\"token punctuation\">(<\/span>\u51bb\u7ed3\u6743\u91cd<span class=\"token punctuation\">)<\/span>  \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u4f5c\u7528: \u63d0\u4f9b\u9ad8\u8d28\u91cf\u7684 Logits <span class=\"token punctuation\">(<\/span>Soft Targets<span class=\"token punctuation\">)<\/span>             \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 &#x1f392; \u5b66\u751f\u6a21\u578b <span class=\"token punctuation\">(<\/span>Student Model<span class=\"token punctuation\">)<\/span>                   
             \u2502<br \/>\n\u2502       \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;Qwen2.5-1.8B-Instruct&#034;<\/span>                        \u2502<br \/>\n\u2502       \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u4f18\u5316<span class=\"token operator\">&gt;<\/span>: PEFT \/ LoRA <span class=\"token punctuation\">(<\/span>\u4f4e\u79e9\u9002\u914d<span class=\"token punctuation\">)<\/span>                         \u2502<br \/>\n\u2502       \u2502   \u251c\u2500\u2500 <span class=\"token assign-left variable\">r<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">64<\/span>, <span class=\"token assign-left variable\">alpha<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">128<\/span>, <span class=\"token assign-left variable\">modules<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;q_proj&#034;<\/span>, <span class=\"token string\">&#034;v_proj&#034;<\/span><span class=\"token punctuation\">..<\/span>.<span class=\"token punctuation\">]<\/span>   \u2502<br \/>\n\u2502       \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u7ed3\u679c: \u4ec5\u8bad\u7ec3 <span class=\"token number\">1.04<\/span>% \u7684\u53c2\u6570 <span class=\"token punctuation\">(<\/span>\u8f7b\u91cf\u5316\u8bad\u7ec3<span class=\"token punctuation\">)<\/span>            \u2502<br \/>\n\u2502       \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u4f5c\u7528: \u5b66\u4e60\u6559\u5e08\u7684\u6982\u7387\u5206\u5e03 &#043; \u771f\u5b9e\u6807\u7b7e                   \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u51c6\u5907\u5c31\u7eea: Tokenizer <span class=\"token 
punctuation\">(<\/span>pad_token<span class=\"token operator\">&#061;<\/span>eos_token<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">2<\/span>. \u77e5\u8bc6\u63d0\u53d6\u4e0e\u6570\u636e\u6d41 <span class=\"token punctuation\">(<\/span>Knowledge Extraction Pipeline<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;YeungNLP\/firefly-train-1.1M&#034;<\/span> <span class=\"token punctuation\">(<\/span>\u6307\u4ee4\u5fae\u8c03\u6570\u636e\u96c6<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 \u21bb \u9884\u5904\u7406\u5faa\u73af <span class=\"token punctuation\">(<\/span>preprocess_function<span class=\"token punctuation\">)<\/span>                             \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u251c\u2500\u2500 A. 
\u683c\u5f0f\u5316 <span class=\"token punctuation\">(<\/span>Formatting<span class=\"token punctuation\">)<\/span>                                     \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6a21\u7248<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;&lt;|im_start|&gt;user&#8230;&lt;|im_end|&gt;&#034;<\/span> <span class=\"token punctuation\">(<\/span>ChatML\u683c\u5f0f<span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u251c\u2500\u2500 B. \u6559\u5e08\u63a8\u7406 <span class=\"token punctuation\">(<\/span>Teacher Forward Pass<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&lt;<\/span>\u2605 \u6838\u5fc3\u6b65\u9aa4<span class=\"token operator\">&gt;<\/span>            \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: Tokenized Input IDs                            \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8ba1\u7b97<span class=\"token operator\">&gt;<\/span>: with torch.no_grad<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>: teacher_model<span class=\"token punctuation\">(<\/span>input<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa: Teacher Logits <span class=\"token punctuation\">(<\/span>\u77e5\u8bc6\u7684\u8f7d\u4f53<span class=\"token punctuation\">)<\/span>                    \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 C. 
\u6570\u636e\u6253\u5305 <span class=\"token punctuation\">(<\/span>Packaging<span class=\"token punctuation\">)<\/span>                                    \u2502<br \/>\n\u2502       \u251c\u2500\u2500 labels: \u771f\u5b9e\u6807\u7b7e <span class=\"token punctuation\">(<\/span>Hard Targets<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502       \u2514\u2500\u2500 teacher_logits: \u6559\u5e08\u8f93\u51fa <span class=\"token punctuation\">(<\/span>Soft Targets<span class=\"token punctuation\">)<\/span>                \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa: \u5305\u542b\u53cc\u91cd\u76d1\u7763\u4fe1\u53f7\u7684 DataLoader \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">3<\/span>. 
\u84b8\u998f\u8bad\u7ec3\u5faa\u73af <span class=\"token punctuation\">(<\/span>Distillation Training Loop<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u7c7b<span class=\"token operator\">&gt;<\/span>: DistillationTrainer <span class=\"token punctuation\">(<\/span>\u7ee7\u627f\u81ea SFTTrainer<span class=\"token punctuation\">)<\/span>                  \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 \u21bb \u8bad\u7ec3\u6b65 <span class=\"token punctuation\">(<\/span>Training Step<span class=\"token punctuation\">)<\/span>                                       \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">1<\/span>. \u5b66\u751f\u63a8\u7406 <span class=\"token punctuation\">(<\/span>Student Forward<span class=\"token punctuation\">)<\/span>                              \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: batch<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;input_ids&#034;<\/span><span class=\"token punctuation\">]<\/span>                             \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa: Student Logits                                 \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">2<\/span>. 
\u635f\u5931\u8ba1\u7b97 <span class=\"token punctuation\">(<\/span>Compute Loss<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&lt;<\/span>\u2605 \u84b8\u998f\u6838\u5fc3\u516c\u5f0f<span class=\"token operator\">&gt;<\/span>                \u2502<br \/>\n\u2502   \u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 L_soft <span class=\"token punctuation\">(<\/span>\u77e5\u8bc6\u5339\u914d\u635f\u5931<span class=\"token punctuation\">)<\/span>: KL Divergence                   \u2502<br \/>\n\u2502   \u2502   \u2502   \u251c\u2500\u2500 KL<span class=\"token punctuation\">(<\/span>Student_Logits\/T <span class=\"token operator\">||<\/span> Teacher_Logits\/T<span class=\"token punctuation\">)<\/span>           \u2502<br \/>\n\u2502   \u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u76ee\u7684: \u8ba9\u5b66\u751f\u6a21\u4eff\u6559\u5e08\u7684\u201c\u6697\u77e5\u8bc6\u201d<span class=\"token punctuation\">(<\/span>\u8f93\u51fa\u5206\u5e03<span class=\"token punctuation\">)<\/span>           \u2502<br \/>\n\u2502   \u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 L_hard <span class=\"token punctuation\">(<\/span>\u57fa\u7840\u4efb\u52a1\u635f\u5931<span class=\"token punctuation\">)<\/span>: Cross Entropy                   \u2502<br \/>\n\u2502   \u2502   \u2502   \u251c\u2500\u2500 CE<span class=\"token punctuation\">(<\/span>Student_Logits, True_Labels<span class=\"token punctuation\">)<\/span>                    \u2502<br \/>\n\u2502   \u2502   \u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u76ee\u7684: \u4fdd\u8bc1\u751f\u6210\u7684\u51c6\u786e\u6027                           \u2502<br \/>\n\u2502   \u2502   \u2502                                                          \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 <span 
class=\"token operator\">&gt;<\/span> \u603bLoss <span class=\"token operator\">&#061;<\/span> <span class=\"token number\">0.7<\/span> * L_hard &#043; <span class=\"token number\">0.3<\/span> * L_soft                 \u2502<br \/>\n\u2502   \u2502                                                              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token number\">3<\/span>. \u53cd\u5411\u4f20\u64ad <span class=\"token punctuation\">(<\/span>Backprop<span class=\"token punctuation\">)<\/span>                                     \u2502<br \/>\n\u2502       \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u66f4\u65b0<span class=\"token operator\">&gt;<\/span>: \u4ec5\u66f4\u65b0\u5b66\u751f\u6a21\u578b\u7684 LoRA \u9002\u914d\u5c42\u6743\u91cd                \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u7ed3\u679c: \u8bad\u7ec3\u5b8c\u6210\u7684\u6a21\u578b\u4fdd\u5b58\u81f3 output_dir \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">4<\/span>. \u9a8c\u8bc1\u4e0e\u6d4b\u8bd5\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Validation <span class=\"token operator\">&amp;<\/span> Testing<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 A. 
\u6a21\u578b\u52a0\u8f7d                                                    \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8bfb\u53d6<span class=\"token operator\">&gt;<\/span>: output_dir <span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;.\/models\/Qwen2.5-1.8B-Distilled&#034;<\/span><span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5408\u5e76<span class=\"token operator\">&gt;<\/span>: Base Model &#043; LoRA Weights                          \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u251c\u2500\u2500 B. \u63a8\u7406\u9a8c\u8bc1 <span class=\"token punctuation\">(<\/span>test_distilled_model<span class=\"token punctuation\">)<\/span>                             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8f93\u5165<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u7684\u6838\u5fc3\u6280\u672f\u6709\u54ea\u4e9b&#xff1f;&#034;<\/span>                 \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u751f\u6210<span class=\"token operator\">&gt;<\/span>: model.generate <span class=\"token punctuation\">(<\/span>use_cache<span class=\"token operator\">&#061;<\/span>True<span class=\"token punctuation\">)<\/span>                    \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u89c2\u6d4b: \u68c0\u67e5 <span class=\"token number\">1<\/span>.8B \u6a21\u578b\u662f\u5426\u5b66\u4f1a\u4e86 7B \u6a21\u578b\u7684\u903b\u8f91\u8868\u8fbe\u80fd\u529b      \u2502<br \/>\n\u2502                                                                  \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u6700\u7ec8\u4ea7\u51fa: 
\u8f7b\u91cf\u5316\u3001\u9ad8\u667a\u5546\u7684\u8fb9\u7f18\u7aef\u6a21\u578b \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<h6>\u6d41\u7a0b\u56fe\u5173\u952e\u70b9\u89e3\u6790 (Key Highlights)<\/h6>\n<li>\n<p>\u53cc\u6a21\u578b\u67b6\u6784 (Dual-Model)&#xff1a;\u56fe\u8868\u6e05\u6670\u533a\u5206\u4e86 Teacher&#xff08;\u53ea\u8bfb\/\u51bb\u7ed3&#xff09;\u548c Student&#xff08;\u53ef\u5199\/\u8bad\u7ec3&#xff09;\u7684\u72b6\u6001\u3002<\/p>\n<\/li>\n<li>\n<p>Soft Targets \u751f\u6210\u4f4d\u7f6e&#xff1a;\u6839\u636e\u60a8\u7684\u4ee3\u7801&#xff0c;Teacher Logits \u662f\u5728 \u6570\u636e\u9884\u5904\u7406\u9636\u6bb5 (Phase 2) \u751f\u6210\u7684&#xff0c;\u8fd9\u610f\u5473\u7740\u8bad\u7ec3\u65f6 GPU \u663e\u5b58\u538b\u529b\u8f83\u5c0f&#xff08;\u56e0\u4e3a\u4e0d\u9700\u8981\u540c\u65f6\u628a\u4e24\u4e2a\u5b8c\u6574\u5927\u6a21\u578b\u653e\u8fdb\u663e\u5b58\u505a\u52a8\u6001\u524d\u5411\u4f20\u64ad&#xff09;&#xff0c;\u8fd9\u662f\u4e00\u79cd\u9ad8\u6548\u7684 Offline Distillation&#xff08;\u79bb\u7ebf\u84b8\u998f&#xff09; \u6216 Cached Distillation \u7b56\u7565\u3002<\/p>\n<\/li>\n<li>\n<p>\u6df7\u5408\u635f\u5931\u51fd\u6570 (Hybrid Loss)&#xff1a;\u5728 Phase 3 \u4e2d&#xff0c;\u5c55\u793a\u4e86 <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>           L<\/p>\n<p>           o<\/p>\n<p>           s<\/p>\n<p>           s<\/p>\n<p>           &#061;<\/p>\n<p>           0.7<\/p>\n<p>           \u00d7<\/p>\n<p>           C<\/p>\n<p>           E<\/p>\n<p>           &#043;<\/p>\n<p>           0.3<\/p>\n<p>           \u00d7<\/p>\n<p>           K<\/p>\n<p>           L<\/p>\n<p>           Loss &#061; 0.7 \\\\times CE &#043; 0.3 \\\\times KL <\/p>\n<p>       <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord 
mathnormal\">L<\/span><span class=\"mord mathnormal\">oss<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">0.7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7667em;vertical-align: -0.0833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0576em\">CE<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">&#043;<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7278em;vertical-align: -0.0833em\"><\/span><span class=\"mord\">0.3<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u00d7<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathnormal\" style=\"margin-right: 0.0715em\">K<\/span><span class=\"mord mathnormal\">L<\/span><\/span><\/span><\/span><\/span><\/span> \u7684\u8ba1\u7b97\u903b\u8f91&#xff0c;\u8fd9\u662f\u77e5\u8bc6\u84b8\u998f\u7684\u6570\u5b66\u672c\u8d28\u3002<\/p>\n<\/li>\n<p>import os<br \/>\nimport torch<br \/>\nimport time<br \/>\nfrom dataclasses import dataclass, field<br \/>\nfrom typing import Optional<br \/>\nimport transformers<br \/>\nfrom transformers import (<br \/>\n    AutoModelForCausalLM,<br \/>\n    AutoTokenizer,<br \/>\n    TrainingArguments,<br \/>\n    Trainer,<br \/>\n    DataCollatorForLanguageModeling,<br \/>\n)<br \/>\nfrom trl import SFTTrainer, DataCollatorForCompletionOnlyLM<br 
\/>\nfrom datasets import load_dataset<br \/>\nimport peft<br \/>\nfrom peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training<br \/>\nfrom dotenv import load_dotenv<\/p>\n<p>load_dotenv()<\/p>\n<p># 1. \u914d\u7f6e\u53c2\u6570<br \/>\n&#064;dataclass<br \/>\nclass DistillationConfig:<br \/>\n    # \u6a21\u578b\u914d\u7f6e<br \/>\n    teacher_model_name: str &#061; &#034;Qwen\/Qwen2.5-7B-Instruct&#034;<br \/>\n    student_model_name: str &#061; &#034;Qwen\/Qwen2.5-1.8B-Instruct&#034;<br \/>\n    model_dir: str &#061; os.getenv(&#034;MODEL_DIR&#034;, &#034;.\/models&#034;)<br \/>\n    output_dir: str &#061; &#034;.\/models\/Qwen2.5-1.8B-Distilled&#034;<br \/>\n    # \u6570\u636e\u914d\u7f6e<br \/>\n    dataset_name: str &#061; &#034;YeungNLP\/firefly-train-1.1M&#034;  # \u4e2d\u6587\u5bf9\u8bdd\u6570\u636e\u96c6<br \/>\n    max_seq_length: int &#061; 1024<br \/>\n    # \u8bad\u7ec3\u914d\u7f6e<br \/>\n    batch_size: int &#061; 4<br \/>\n    gradient_accumulation_steps: int &#061; 4<br \/>\n    learning_rate: float &#061; 2e-4<br \/>\n    num_train_epochs: int &#061; 3<br \/>\n    lora_rank: int &#061; 64<br \/>\n    lora_alpha: int &#061; 128<br \/>\n    lora_dropout: float &#061; 0.05<br \/>\n    # \u786c\u4ef6\u914d\u7f6e<br \/>\n    device: str &#061; &#034;cuda&#034; if torch.cuda.is_available() else &#034;cpu&#034;<br \/>\n    fp16: bool &#061; True<br \/>\n    gradient_checkpointing: bool &#061; True<\/p>\n<p># 2. \u52a0\u8f7d\u914d\u7f6e<br \/>\nconfig &#061; DistillationConfig()<\/p>\n<p># 3. 
\u52a0\u8f7dTokenizer<br \/>\ntokenizer &#061; AutoTokenizer.from_pretrained(<br \/>\n    os.path.join(config.model_dir, config.student_model_name.split(&#034;\/&#034;)[-1])<br \/>\n    if os.path.exists(os.path.join(config.model_dir, config.student_model_name.split(&#034;\/&#034;)[-1]))<br \/>\n    else config.student_model_name,<br \/>\n    trust_remote_code&#061;True,<br \/>\n    use_fast&#061;False,<br \/>\n    padding_side&#061;&#034;right&#034;,<br \/>\n)<br \/>\ntokenizer.pad_token &#061; tokenizer.eos_token<\/p>\n<p># 4. \u52a0\u8f7d\u6559\u5e08\u6a21\u578b&#xff08;\u7528\u4e8e\u751f\u6210Soft Target&#xff09;<br \/>\nprint(&#034;&#061;&#061;&#061; \u52a0\u8f7d\u6559\u5e08\u6a21\u578b &#061;&#061;&#061;&#034;)<br \/>\nteacher_model &#061; AutoModelForCausalLM.from_pretrained(<br \/>\n    os.path.join(config.model_dir, config.teacher_model_name.split(&#034;\/&#034;)[-1])<br \/>\n    if os.path.exists(os.path.join(config.model_dir, config.teacher_model_name.split(&#034;\/&#034;)[-1]))<br \/>\n    else config.teacher_model_name,<br \/>\n    device_map&#061;&#034;auto&#034;,<br \/>\n    trust_remote_code&#061;True,<br \/>\n    torch_dtype&#061;torch.float16,<br \/>\n    low_cpu_mem_usage&#061;True,<br \/>\n)<br \/>\nteacher_model.eval()<\/p>\n<p># 5. \u52a0\u8f7d\u5b66\u751f\u6a21\u578b&#xff08;\u5f85\u84b8\u998f&#xff09;<br \/>\nprint(&#034;&#061;&#061;&#061; \u52a0\u8f7d\u5b66\u751f\u6a21\u578b &#061;&#061;&#061;&#034;)<br \/>\nstudent_model &#061; AutoModelForCausalLM.from_pretrained(<br \/>\n    os.path.join(config.model_dir, config.student_model_name.split(&#034;\/&#034;)[-1])<br \/>\n    if os.path.exists(os.path.join(config.model_dir, config.student_model_name.split(&#034;\/&#034;)[-1]))<br \/>\n    else config.student_model_name,<br \/>\n    device_map&#061;&#034;auto&#034;,<br \/>\n    trust_remote_code&#061;True,<br \/>\n    torch_dtype&#061;torch.float16,<br \/>\n    low_cpu_mem_usage&#061;True,<br \/>\n)<\/p>\n<p># 6. 
\u914d\u7f6eLoRA&#xff08;\u4f4e\u79e9\u9002\u914d&#xff0c;\u51cf\u5c11\u8bad\u7ec3\u53c2\u6570&#xff09;<br \/>\nlora_config &#061; LoraConfig(<br \/>\n    r&#061;config.lora_rank,<br \/>\n    lora_alpha&#061;config.lora_alpha,<br \/>\n    target_modules&#061;[&#034;q_proj&#034;, &#034;k_proj&#034;, &#034;v_proj&#034;, &#034;o_proj&#034;, &#034;gate_proj&#034;, &#034;up_proj&#034;, &#034;down_proj&#034;],<br \/>\n    lora_dropout&#061;config.lora_dropout,<br \/>\n    bias&#061;&#034;none&#034;,<br \/>\n    task_type&#061;&#034;CAUSAL_LM&#034;,<br \/>\n)<br \/>\nstudent_model &#061; get_peft_model(student_model, lora_config)<br \/>\nstudent_model.print_trainable_parameters()<\/p>\n<p># 7. \u52a0\u8f7d\u5e76\u9884\u5904\u7406\u6570\u636e\u96c6<br \/>\ndef load_and_preprocess_data():<br \/>\n    # \u52a0\u8f7d\u6570\u636e\u96c6<br \/>\n    dataset &#061; load_dataset(config.dataset_name, split&#061;&#034;train[:10%]&#034;)  # \u53d610%\u6570\u636e\u7528\u4e8e\u6f14\u793a<\/p>\n<p>    # \u9884\u5904\u7406\u51fd\u6570<br \/>\n    def preprocess_function(examples):<br \/>\n        # \u6784\u9020\u5bf9\u8bdd\u683c\u5f0f<br \/>\n        texts &#061; []<br \/>\n        for instruction, input_text, output_text in zip(examples[&#034;instruction&#034;], examples[&#034;input&#034;], examples[&#034;output&#034;]):<br \/>\n            if input_text:<br \/>\n                prompt &#061; f&#034;&lt;|im_start|&gt;user\\\\n{instruction}\\\\n{input_text}&lt;|im_end|&gt;\\\\n&lt;|im_start|&gt;assistant\\\\n&#034;<br \/>\n            else:<br \/>\n                prompt &#061; f&#034;&lt;|im_start|&gt;user\\\\n{instruction}&lt;|im_end|&gt;\\\\n&lt;|im_start|&gt;assistant\\\\n&#034;<br \/>\n            full_text &#061; prompt &#043; output_text &#043; &#034;&lt;|im_end|&gt;&#034;<br \/>\n            texts.append(full_text)<\/p>\n<p>        # \u5206\u8bcd<br \/>\n        tokenized &#061; tokenizer(<br \/>\n            texts,<br \/>\n            truncation&#061;True,<br \/>\n            
max_length&#061;config.max_seq_length,<br \/>\n            padding&#061;&#034;max_length&#034;,<br \/>\n            return_tensors&#061;&#034;pt&#034;,<br \/>\n        )<\/p>\n<p>        # \u751f\u6210\u6559\u5e08\u6a21\u578b\u7684Soft Target&#xff08;\u4ec5\u8ba1\u7b97\u4e00\u6b21&#xff09;<br \/>\n        with torch.no_grad():<br \/>\n            teacher_logits &#061; teacher_model(<br \/>\n                input_ids&#061;tokenized[&#034;input_ids&#034;].to(config.device),<br \/>\n                attention_mask&#061;tokenized[&#034;attention_mask&#034;].to(config.device)<br \/>\n            ).logits<\/p>\n<p>        # \u6784\u9020\u8bad\u7ec3\u6570\u636e<br \/>\n        tokenized[&#034;labels&#034;] &#061; tokenized[&#034;input_ids&#034;].clone()<br \/>\n        tokenized[&#034;teacher_logits&#034;] &#061; teacher_logits.cpu()<\/p>\n<p>        return tokenized<\/p>\n<p>    # \u9884\u5904\u7406\u6570\u636e\u96c6<br \/>\n    tokenized_dataset &#061; dataset.map(<br \/>\n        preprocess_function,<br \/>\n        batched&#061;True,<br \/>\n        batch_size&#061;config.batch_size,<br \/>\n        remove_columns&#061;dataset.column_names,<br \/>\n    )<\/p>\n<p>    # \u5212\u5206\u8bad\u7ec3\u96c6\u548c\u9a8c\u8bc1\u96c6<br \/>\n    split_dataset &#061; tokenized_dataset.train_test_split(test_size&#061;0.1)<br \/>\n    return split_dataset[&#034;train&#034;], split_dataset[&#034;test&#034;]<\/p>\n<p># 8. 
\u81ea\u5b9a\u4e49\u84b8\u998f\u635f\u5931\u51fd\u6570<br \/>\nclass DistillationTrainer(SFTTrainer):<br \/>\n    def compute_loss(self, model, inputs, return_outputs&#061;False):<br \/>\n        # \u83b7\u53d6\u5b66\u751f\u6a21\u578b\u8f93\u51fa<br \/>\n        student_outputs &#061; model(<br \/>\n            input_ids&#061;inputs[&#034;input_ids&#034;].to(config.device),<br \/>\n            attention_mask&#061;inputs[&#034;attention_mask&#034;].to(config.device),<br \/>\n            labels&#061;inputs[&#034;labels&#034;].to(config.device),<br \/>\n        )<br \/>\n        student_logits &#061; student_outputs.logits<\/p>\n<p>        # \u83b7\u53d6\u6559\u5e08\u6a21\u578bLogits<br \/>\n        teacher_logits &#061; inputs[&#034;teacher_logits&#034;].to(config.device)<\/p>\n<p>        # \u84b8\u998f\u635f\u5931&#xff1a;KL\u6563\u5ea6&#xff08;\u5339\u914d\u5206\u5e03&#xff09; &#043; \u4ea4\u53c9\u71b5&#xff08;\u57fa\u7840\u635f\u5931&#xff09;<br \/>\n        kl_loss &#061; torch.nn.functional.kl_div(<br \/>\n            torch.nn.functional.log_softmax(student_logits \/ 2.0, dim&#061;-1),<br \/>\n            torch.nn.functional.softmax(teacher_logits \/ 2.0, dim&#061;-1),<br \/>\n            reduction&#061;&#034;batchmean&#034;,<br \/>\n            log_target&#061;False,<br \/>\n        ) * (2.0 **2)<\/p>\n<p>        ce_loss &#061; student_outputs.loss<br \/>\n        total_loss &#061; 0.7 * ce_loss &#043; 0.3 * kl_loss  # \u6743\u91cd\u53ef\u8c03<\/p>\n<p>        return (total_loss, student_outputs) if return_outputs else total_loss<\/p>\n<p># 9. 
\u914d\u7f6e\u8bad\u7ec3\u53c2\u6570<br \/>\ntraining_args &#061; TrainingArguments(<br \/>\n    output_dir&#061;config.output_dir,<br \/>\n    per_device_train_batch_size&#061;config.batch_size,<br \/>\n    gradient_accumulation_steps&#061;config.gradient_accumulation_steps,<br \/>\n    learning_rate&#061;config.learning_rate,<br \/>\n    num_train_epochs&#061;config.num_train_epochs,<br \/>\n    fp16&#061;config.fp16,<br \/>\n    gradient_checkpointing&#061;config.gradient_checkpointing,<br \/>\n    save_strategy&#061;&#034;epoch&#034;,<br \/>\n    evaluation_strategy&#061;&#034;epoch&#034;,<br \/>\n    logging_steps&#061;10,<br \/>\n    save_steps&#061;100,<br \/>\n    eval_steps&#061;100,<br \/>\n    load_best_model_at_end&#061;True,<br \/>\n    metric_for_best_model&#061;&#034;loss&#034;,<br \/>\n    greater_is_better&#061;False,<br \/>\n    push_to_hub&#061;False,<br \/>\n    report_to&#061;&#034;none&#034;,<br \/>\n)<\/p>\n<p># 10. \u6570\u636e\u6536\u96c6\u5668<br \/>\ndata_collator &#061; DataCollatorForCompletionOnlyLM(<br \/>\n    tokenizer&#061;tokenizer,<br \/>\n    response_template&#061;&#034;&lt;|im_start|&gt;assistant\\\\n&#034;,<br \/>\n    mlm&#061;False,<br \/>\n)<\/p>\n<p># 11. 
\u5f00\u59cb\u84b8\u998f\u8bad\u7ec3<br \/>\ndef run_distillation():<br \/>\n    # \u52a0\u8f7d\u6570\u636e<br \/>\n    train_dataset, eval_dataset &#061; load_and_preprocess_data()<\/p>\n<p>    # \u521b\u5efaTrainer<br \/>\n    trainer &#061; DistillationTrainer(<br \/>\n        model&#061;student_model,<br \/>\n        args&#061;training_args,<br \/>\n        train_dataset&#061;train_dataset,<br \/>\n        eval_dataset&#061;eval_dataset,<br \/>\n        tokenizer&#061;tokenizer,<br \/>\n        data_collator&#061;data_collator,<br \/>\n        peft_config&#061;lora_config,<br \/>\n        max_seq_length&#061;config.max_seq_length,<br \/>\n    )<\/p>\n<p>    # \u5f00\u59cb\u8bad\u7ec3<br \/>\n    start_train &#061; time.time()<br \/>\n    trainer.train()<br \/>\n    end_train &#061; time.time()<br \/>\n    print(f&#034;\u84b8\u998f\u8bad\u7ec3\u8017\u65f6&#xff1a;{end_train - start_train:.2f}\u79d2&#034;)<\/p>\n<p>    # \u4fdd\u5b58\u6a21\u578b<br \/>\n    trainer.save_model(config.output_dir)<br \/>\n    tokenizer.save_pretrained(config.output_dir)<br \/>\n    print(f&#034;\u84b8\u998f\u6a21\u578b\u4fdd\u5b58\u81f3&#xff1a;{config.output_dir}&#034;)<\/p>\n<p># 12. 
\u6d4b\u8bd5\u84b8\u998f\u540e\u6a21\u578b<br \/>\ndef test_distilled_model():<br \/>\n    # \u52a0\u8f7d\u84b8\u998f\u540e\u7684\u6a21\u578b<br \/>\n    distilled_model &#061; AutoModelForCausalLM.from_pretrained(<br \/>\n        config.output_dir,<br \/>\n        device_map&#061;&#034;auto&#034;,<br \/>\n        trust_remote_code&#061;True,<br \/>\n        torch_dtype&#061;torch.float16,<br \/>\n    )<br \/>\n    distilled_model.config.use_cache &#061; True  # \u542f\u7528KV Cache<\/p>\n<p>    # \u6d4b\u8bd5\u7528\u4f8b<br \/>\n    prompts &#061; [<br \/>\n        &#034;\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u7684\u6838\u5fc3\u6280\u672f\u6709\u54ea\u4e9b&#xff1f;&#034;,<br \/>\n        &#034;KV Cache\u548c\u91cf\u5316\u6280\u672f\u7684\u533a\u522b\u662f\u4ec0\u4e48&#xff1f;&#034;,<br \/>\n        &#034;\u5982\u4f55\u5728\u8fb9\u7f18\u8bbe\u5907\u90e8\u7f72\u5927\u6a21\u578b&#xff1f;&#034;<br \/>\n    ]<\/p>\n<p>    total_time &#061; 0<br \/>\n    total_tokens &#061; 0<\/p>\n<p>    for prompt in prompts:<br \/>\n        # \u6784\u9020\u8f93\u5165<br \/>\n        messages &#061; [{&#034;role&#034;: &#034;user&#034;, &#034;content&#034;: prompt}]<br \/>\n        input_ids &#061; tokenizer.apply_chat_template(<br \/>\n            messages,<br \/>\n            tokenize&#061;True,<br \/>\n            add_generation_prompt&#061;True,<br \/>\n            return_tensors&#061;&#034;pt&#034;<br \/>\n        ).to(config.device)<\/p>\n<p>        # \u63a8\u7406<br \/>\n        start_time &#061; time.time()<br \/>\n        outputs &#061; distilled_model.generate(<br \/>\n            input_ids&#061;input_ids,<br \/>\n            max_new_tokens&#061;512,<br \/>\n            temperature&#061;0.1,<br \/>\n            top_p&#061;0.95,<br \/>\n            do_sample&#061;True,<br \/>\n            use_cache&#061;True,<br \/>\n        )<br \/>\n        end_time &#061; time.time()<\/p>\n<p>        # \u89e3\u6790\u8f93\u51fa<br \/>\n        output_text &#061; 
tokenizer.decode(outputs[0], skip_special_tokens&#061;True)<br \/>\n        output_text &#061; output_text.split(&#034;&lt;|im_end|&gt;&#034;)[-2].strip()<\/p>\n<p>        # \u7edf\u8ba1<br \/>\n        elapsed_time &#061; end_time - start_time<br \/>\n        num_tokens &#061; len(outputs[0]) - len(input_ids[0])<br \/>\n        total_time &#043;&#061; elapsed_time<br \/>\n        total_tokens &#043;&#061; num_tokens<\/p>\n<p>        print(f&#034;\\\\n&#061;&#061;&#061; \u8f93\u5165&#xff1a;{prompt[:50]}&#8230; &#061;&#061;&#061;&#034;)<br \/>\n        print(f&#034;\u8017\u65f6&#xff1a;{elapsed_time:.2f}\u79d2&#034;)<br \/>\n        print(f&#034;\u751f\u6210Token\u6570&#xff1a;{num_tokens}&#034;)<br \/>\n        print(f&#034;\u901f\u5ea6&#xff1a;{num_tokens\/elapsed_time:.2f} tokens\/\u79d2&#034;)<br \/>\n        print(f&#034;\u8f93\u51fa&#xff1a;{output_text[:200]}&#8230;&#034;)<\/p>\n<p>    # \u6c47\u603b<br \/>\n    avg_speed &#061; total_tokens \/ total_time<br \/>\n    print(f&#034;\\\\n\u84b8\u998f\u6a21\u578b\u5e73\u5747\u901f\u5ea6&#xff1a;{avg_speed:.2f} tokens\/\u79d2&#034;)<\/p>\n<p>    # \u663e\u5b58\u4f7f\u7528<br \/>\n    if torch.cuda.is_available():<br \/>\n        mem_used &#061; torch.cuda.max_memory_allocated() \/ 1024 \/ 1024 \/ 1024<br \/>\n        print(f&#034;\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;{mem_used:.2f} GB&#034;)<\/p>\n<p>if __name__ &#061;&#061; &#034;__main__&#034;:<br \/>\n    print(&#034;&#061;&#061;&#061; \u5f00\u59cb\u6a21\u578b\u84b8\u998f &#061;&#061;&#061;&#034;)<br \/>\n    run_distillation()<br \/>\n    print(&#034;\\\\n&#061;&#061;&#061; \u6d4b\u8bd5\u84b8\u998f\u540e\u6a21\u578b &#061;&#061;&#061;&#034;)<br \/>\n    test_distilled_model()<\/p>\n<h5>5.3.2 \u6548\u679c\u9a8c\u8bc1\u8f93\u51fa\u793a\u4f8b<\/h5>\n<p>&#061;&#061;&#061; \u52a0\u8f7d\u6559\u5e08\u6a21\u578b &#061;&#061;&#061;<br \/>\n&#061;&#061;&#061; \u52a0\u8f7d\u5b66\u751f\u6a21\u578b &#061;&#061;&#061;<br \/>\ntrainable params: 18,874,368 || all 
params: 1,807,677,440 || trainable%: 1.04<\/p>\n<p>&#061;&#061;&#061; \u5f00\u59cb\u6a21\u578b\u84b8\u998f &#061;&#061;&#061;<br \/>\nEpoch 1\/3: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 100\/100 [1:20:12&lt;00:00, 48.12s\/it]<br \/>\nEpoch 2\/3: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 100\/100 [1:18:45&lt;00:00, 47.25s\/it]<br \/>\nEpoch 3\/3: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 100\/100 [1:19:23&lt;00:00, 47.63s\/it]<br \/>\n\u84b8\u998f\u8bad\u7ec3\u8017\u65f6&#xff1a;23820.45\u79d2<br \/>\n\u84b8\u998f\u6a21\u578b\u4fdd\u5b58\u81f3&#xff1a;.\/models\/Qwen2.5-1.8B-Distilled<\/p>\n<p>&#061;&#061;&#061; \u6d4b\u8bd5\u84b8\u998f\u540e\u6a21\u578b &#061;&#061;&#061;<br \/>\n&#061;&#061;&#061; \u8f93\u5165&#xff1a;\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u7684\u6838\u5fc3\u6280\u672f\u6709\u54ea\u4e9b&#xff1f;&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;1.28\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;312<br \/>\n\u901f\u5ea6&#xff1a;243.75 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u7684\u6838\u5fc3\u6280\u672f\u4e3b\u8981\u5305\u62ec\u4e09\u7c7b&#xff1a;1. KV Cache\u4f18\u5316&#xff1a;\u7f13\u5b58\u6ce8\u610f\u529b\u673a\u5236\u7684Key\/Value\u77e9\u9635&#xff0c;\u907f\u514d\u91cd\u590d\u8ba1\u7b97&#xff0c;\u63d0\u5347\u957f\u6587\u672c\u751f\u6210\u901f\u5ea6&#xff1b;2. 
\u91cf\u5316\u6280\u672f&#xff1a;\u5c06\u6a21\u578b\u6743\u91cd\u4eceFP16\u964d\u81f3INT8\/INT4&#8230;<\/p>\n<p>&#061;&#061;&#061; \u8f93\u5165&#xff1a;KV Cache\u548c\u91cf\u5316\u6280\u672f\u7684\u533a\u522b\u662f\u4ec0\u4e48&#xff1f;&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;1.15\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;289<br \/>\n\u901f\u5ea6&#xff1a;251.30 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;KV Cache\u548c\u91cf\u5316\u6280\u672f\u7684\u6838\u5fc3\u533a\u522b\u5728\u4e8e\u4f18\u5316\u7ef4\u5ea6\u4e0d\u540c&#xff1a;KV Cache\u662f\u4ece\u8ba1\u7b97\u91cf\u89d2\u5ea6\u4f18\u5316&#xff0c;\u901a\u8fc7\u7f13\u5b58\u907f\u514d\u91cd\u590d\u8ba1\u7b97&#xff1b;\u91cf\u5316\u6280\u672f\u662f\u4ece\u5b58\u50a8\u89d2\u5ea6\u4f18\u5316&#8230;<\/p>\n<p>&#061;&#061;&#061; \u8f93\u5165&#xff1a;\u5982\u4f55\u5728\u8fb9\u7f18\u8bbe\u5907\u90e8\u7f72\u5927\u6a21\u578b&#xff1f;&#8230; &#061;&#061;&#061;<br \/>\n\u8017\u65f6&#xff1a;1.08\u79d2<br \/>\n\u751f\u6210Token\u6570&#xff1a;276<br \/>\n\u901f\u5ea6&#xff1a;255.56 tokens\/\u79d2<br \/>\n\u8f93\u51fa&#xff1a;\u8fb9\u7f18\u8bbe\u5907\u90e8\u7f72\u5927\u6a21\u578b\u7684\u6838\u5fc3\u7b56\u7565\u662f\u6a21\u578b\u84b8\u998f&#043;\u91cf\u5316&#xff1a;\u9996\u5148\u5c067B\/14B\u5927\u6a21\u578b\u84b8\u998f\u4e3a1.8B\/3B\u5c0f\u6a21\u578b&#xff0c;\u518d\u8fdb\u884cINT4\u91cf\u5316&#xff0c;\u7ed3\u5408KV Cache\u4f18\u5316&#8230;<\/p>\n<p>\u84b8\u998f\u6a21\u578b\u5e73\u5747\u901f\u5ea6&#xff1a;250.17 tokens\/\u79d2<br \/>\n\u6700\u5927\u663e\u5b58\u5360\u7528&#xff1a;2.45 GB<\/p>\n<h3>\u516d\u3001\u7efc\u5408\u4f18\u5316\u4e0e\u751f\u4ea7\u7ea7\u90e8\u7f72<\/h3>\n<h4>6.1 \u7ec4\u5408\u4f18\u5316\u65b9\u6848<\/h4>\n<table>\n<tr>\u573a\u666f\u4f18\u5316\u7ec4\u5408\u9884\u671f\u6548\u679c\u793a\u4f8b\u914d\u7f6e<\/tr>\n<tbody>\n<tr>\n<td>\u672c\u5730\u684c\u9762\u90e8\u7f72<\/td>\n<td>\u84b8\u998f\u6a21\u578b&#xff08;1.8B&#xff09; &#043; INT4 GPTQ &#043; KV Cache<\/td>\n<td>\u901f\u5ea6&#xff1a;200&#043; 
tokens\/\u79d2&#xff0c;\u663e\u5b58&#xff1a;&lt;3GB<\/td>\n<td>Qwen2.5-1.8B-Distilled &#043; GPTQ 4bit &#043; vLLM KV Cache<\/td>\n<\/tr>\n<tr>\n<td>\u4e91\u7aefAPI\u670d\u52a1<\/td>\n<td>7B\u6a21\u578b &#043; INT8\u91cf\u5316 &#043; KV Cache &#043; \u6279\u91cf\u63a8\u7406<\/td>\n<td>\u901f\u5ea6&#xff1a;80&#043; tokens\/\u79d2&#xff0c;\u663e\u5b58&#xff1a;&lt;8GB&#xff0c;\u5e76\u53d1&#xff1a;32&#043;<\/td>\n<td>Qwen2.5-7B &#043; bitsandbytes INT8 &#043; vLLM<\/td>\n<\/tr>\n<tr>\n<td>\u8fb9\u7f18\u8bbe\u5907&#xff08;\u5982NVIDIA Jetson&#xff09;<\/td>\n<td>\u84b8\u998f\u6a21\u578b&#xff08;1.8B&#xff09; &#043; INT4 AWQ &#043; \u8f7b\u91cf\u5316KV Cache<\/td>\n<td>\u901f\u5ea6&#xff1a;50&#043; tokens\/\u79d2&#xff0c;\u663e\u5b58&#xff1a;&lt;2GB<\/td>\n<td>Qwen2.5-1.8B-Distilled &#043; AWQ 4bit<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>6.2 \u751f\u4ea7\u7ea7\u90e8\u7f72\u4ee3\u7801&#xff08;FastAPI &#043; vLLM&#xff09;<\/h4>\n<h5>\u751f\u4ea7\u7ea7\u90e8\u7f72\u6d41\u7a0b\u56fe (FastAPI &#043; vLLM Serving Architecture)<\/h5>\n<p>\u2502<br \/>\n\u251c\u2500\u2500 \u3010\u670d\u52a1\u542f\u52a8\u5165\u53e3\u3011: <span class=\"token keyword\">if<\/span> __name__ <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;__main__&#034;<\/span><span class=\"token builtin class-name\">:<\/span> uvicorn.run<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">..<\/span>.<span class=\"token punctuation\">)<\/span><br \/>\n\u2502<br \/>\n\u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span>. 
\u5168\u5c40\u521d\u59cb\u5316\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Service Initialization<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. \u57fa\u7840\u8bbe\u65bd\u51c6\u5907                                             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u52a0\u8f7d\u914d\u7f6e<span class=\"token operator\">&gt;<\/span>: .env <span class=\"token punctuation\">(<\/span>PORT, MODEL_PATH, GPU_MEM_UTIL<span class=\"token punctuation\">)<\/span>       \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u65e5\u5fd7\u7cfb\u7edf<span class=\"token operator\">&gt;<\/span>: logging.basicConfig <span class=\"token punctuation\">(<\/span>\u76d1\u63a7\u670d\u52a1\u72b6\u6001<span class=\"token punctuation\">)<\/span>          \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. 
Web \u6846\u67b6\u6784\u5efa <span class=\"token punctuation\">(<\/span>FastAPI Setup<span class=\"token punctuation\">)<\/span>                             \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u4e2d\u95f4\u4ef6<span class=\"token operator\">&gt;<\/span>: CORSMiddleware <span class=\"token punctuation\">(<\/span>\u5141\u8bb8\u8de8\u57df\u8bbf\u95ee<span class=\"token punctuation\">)<\/span>                 \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8def\u7531<span class=\"token operator\">&gt;<\/span>: \/health <span class=\"token punctuation\">(<\/span>\u5fc3\u8df3\u68c0\u6d4b<span class=\"token punctuation\">)<\/span>, \/chat <span class=\"token punctuation\">(<\/span>\u6838\u5fc3\u4e1a\u52a1<span class=\"token punctuation\">)<\/span>            \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 C. \u63a8\u7406\u5f15\u64ce\u52a0\u8f7d <span class=\"token punctuation\">(<\/span>vLLM Engine Loading<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&lt;<\/span>\u2605 \u8017\u65f6\u64cd\u4f5c<span class=\"token operator\">&gt;<\/span>          \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6a21\u578b\u8def\u5f84<span class=\"token operator\">&gt;<\/span>: <span class=\"token string\">&#034;Qwen2.5-7B-Instruct-GPTQ-4bit&#034;<\/span>            \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u663e\u5b58\u7b56\u7565<span class=\"token operator\">&gt;<\/span>: <span class=\"token assign-left variable\">gpu_memory_utilization<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.9<\/span>                 \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u5e76\u884c\u7b56\u7565<span class=\"token operator\">&gt;<\/span>: <span class=\"token 
assign-left variable\">tensor_parallel_size<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span>                     \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u91cf\u5316\u914d\u7f6e<span class=\"token operator\">&gt;<\/span>: <span class=\"token assign-left variable\">quantization<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;gptq&#034;<\/span> <span class=\"token punctuation\">(<\/span>\u6838\u5fc3\u52a0\u901f<span class=\"token punctuation\">)<\/span>              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u7ed3\u679c: llm \u5bf9\u8c61\u5e38\u9a7b\u663e\u5b58, PagedAttention \u5c31\u7eea           \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u670d\u52a1\u72b6\u6001: Listening on <span class=\"token number\">0.0<\/span>.0.0:8000 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n         \u2502 <span class=\"token operator\">&lt;<\/span>HTTP POST \/chat<span class=\"token operator\">&gt;<\/span><br \/>\n         \u2502 Payload: <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;messages&#034;<\/span><span class=\"token builtin class-name\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">..<\/span>.<span class=\"token punctuation\">]<\/span>, <span class=\"token string\">&#034;temperature&#034;<\/span><span class=\"token builtin class-name\">:<\/span> <span class=\"token number\">0.1<\/span><span class=\"token punctuation\">}<\/span><br \/>\n         \u2502<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">2<\/span>. 
\u8bf7\u6c42\u5904\u7406\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Request Handling Pipeline<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. \u6570\u636e\u6821\u9a8c <span class=\"token punctuation\">(<\/span>Pydantic Validation<span class=\"token punctuation\">)<\/span>                           \u2502<br \/>\n\u2502   \u251c\u2500\u2500 class ChatRequest: \u68c0\u67e5 messages \u683c\u5f0f, \u9ed8\u8ba4\u53c2\u6570\u586b\u5145      \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u5f02\u5e38\u5904\u7406: \u82e5\u6821\u9a8c\u5931\u8d25 &#8211;<span class=\"token operator\">&gt;<\/span> <span class=\"token number\">422<\/span> Unprocessable Entity      \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. 
Prompt \u5de5\u7a0b <span class=\"token punctuation\">(<\/span>Prompt Engineering<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u904d\u5386<span class=\"token operator\">&gt;<\/span>: request.messages                               \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u62fc\u63a5<span class=\"token operator\">&gt;<\/span>: <span class=\"token operator\">&lt;<\/span><span class=\"token operator\">|<\/span>im_start<span class=\"token operator\">|<\/span><span class=\"token operator\">&gt;<\/span>user<span class=\"token punctuation\">\\\\<\/span>n<span class=\"token punctuation\">..<\/span>.<span class=\"token operator\">&lt;<\/span><span class=\"token operator\">|<\/span>im_end<span class=\"token operator\">|<\/span><span class=\"token operator\">&gt;<\/span><span class=\"token punctuation\">\\\\<\/span>n <span class=\"token punctuation\">(<\/span>ChatML<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 C. 
\u53c2\u6570\u9002\u914d <span class=\"token punctuation\">(<\/span>Sampling Configuration<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u6784\u5efa<span class=\"token operator\">&gt;<\/span>: SamplingParams <span class=\"token punctuation\">(<\/span>top_p<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.95<\/span>, <span class=\"token assign-left variable\">max_tokens<\/span><span class=\"token operator\">&#061;<\/span><span class=\"token number\">2048<\/span><span class=\"token punctuation\">)<\/span>   \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u51c6\u5907\u5c31\u7eea: Prompt String &#043; SamplingParams \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">3<\/span>. 
\u5f02\u6b65\u63a8\u7406\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Async Inference Execution<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 <span class=\"token operator\">&lt;<\/span>\u8c03\u7528<span class=\"token operator\">&gt;<\/span>: llm.generate <span class=\"token punctuation\">(<\/span>\u975e\u963b\u585e\/\u963b\u585e\u53d6\u51b3\u4e8e vLLM \u7248\u672c\u5b9e\u73b0<span class=\"token punctuation\">)<\/span>       \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 \u5185\u90e8\u8c03\u5ea6 <span class=\"token punctuation\">(<\/span>vLLM Scheduler<span class=\"token punctuation\">)<\/span>:                                  \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">1<\/span>. \u52a0\u5165\u7b49\u5f85\u961f\u5217 <span class=\"token punctuation\">(<\/span>Waiting Queue<span class=\"token punctuation\">)<\/span>                        \u2502<br \/>\n\u2502   \u251c\u2500\u2500 <span class=\"token number\">2<\/span>. \u6279\u5904\u7406\u8c03\u5ea6 <span class=\"token punctuation\">(<\/span>Continuous Batching<span class=\"token punctuation\">)<\/span>                    \u2502<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 \u5c06\u8be5\u8bf7\u6c42\u4e0e\u5176\u4ed6\u5e76\u53d1\u8bf7\u6c42\u5408\u5e76\u8fdb\u540c\u4e00\u4e2a Batch             \u2502<br \/>\n\u2502   \u2514\u2500\u2500 <span class=\"token number\">3<\/span>. 
PagedAttention \u663e\u5b58\u5206\u914d                             \u2502<br \/>\n\u2502       \u2514\u2500\u2500 \u52a8\u6001\u6620\u5c04 KV Cache \u5230\u7269\u7406\u663e\u5b58\u5757                      \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 \u8ba1\u7b97\u8fc7\u7a0b <span class=\"token punctuation\">(<\/span>Execution<span class=\"token punctuation\">)<\/span>:                                       \u2502<br \/>\n\u2502   \u2514\u2500\u2500 GPU Kernel \u8fd0\u7b97 <span class=\"token punctuation\">(<\/span>GPTQ INT4 GEMM<span class=\"token punctuation\">)<\/span>                       \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8f93\u51fa\u7ed3\u679c: RequestOutput <span class=\"token punctuation\">(<\/span>\u5305\u542b generated_text<span class=\"token punctuation\">)<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n         \u2502<br \/>\n         \u25bc<br \/>\n<span class=\"token punctuation\">[<\/span><span class=\"token number\">4<\/span>. \u54cd\u5e94\u4e0e\u76d1\u63a7\u9636\u6bb5 <span class=\"token punctuation\">(<\/span>Response <span class=\"token operator\">&amp;<\/span> Monitoring<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 A. 
\u7ed3\u679c\u89e3\u6790                                                 \u2502<br \/>\n\u2502   \u251c\u2500\u2500 \u63d0\u53d6\u6587\u672c: outputs<span class=\"token punctuation\">[<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">]<\/span>.text                              \u2502<br \/>\n\u2502   \u2514\u2500\u2500 \u7edf\u8ba1 Token: prompt_tokens, completion_tokens           \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 B. \u65e5\u5fd7\u8bb0\u5f55 <span class=\"token punctuation\">(<\/span>Observability<span class=\"token punctuation\">)<\/span>                                 \u2502<br \/>\n\u2502   \u2514\u2500\u2500 logger.info: \u8bb0\u5f55 SessionID \u53ca\u8f93\u5165\u8f93\u51fa\u957f\u5ea6 <span class=\"token punctuation\">(<\/span>\u7528\u4e8e\u8ba1\u8d39\/\u5206\u6790<span class=\"token punctuation\">)<\/span>\u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u251c\u2500\u2500 C. 
\u54cd\u5e94\u6784\u9020                                                 \u2502<br \/>\n\u2502   \u2514\u2500\u2500 class ChatResponse: \u5c01\u88c5\u6807\u51c6 JSON \u7ed3\u6784 <span class=\"token punctuation\">(<\/span>code, data<span class=\"token punctuation\">)<\/span>     \u2502<br \/>\n\u2502                                                              \u2502<br \/>\n\u2514\u2500\u2500 <span class=\"token operator\">&gt;<\/span> \u8fd4\u56de HTTP <span class=\"token number\">200<\/span> OK: <span class=\"token punctuation\">{<\/span><span class=\"token string\">&#034;code&#034;<\/span><span class=\"token builtin class-name\">:<\/span> <span class=\"token number\">200<\/span>, <span class=\"token string\">&#034;data&#034;<\/span><span class=\"token builtin class-name\">:<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token punctuation\">..<\/span>.<span class=\"token punctuation\">}<\/span><span class=\"token punctuation\">}<\/span> \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<\/p>\n<h6>\u6d41\u7a0b\u56fe\u6838\u5fc3\u903b\u8f91\u89e3\u6790<\/h6>\n<li>\u5168\u5c40\u5355\u4f8b (Global Singleton)&#xff1a;\u6d41\u7a0b\u56fe\u5f3a\u8c03\u4e86 llm &#061; LLM(&#8230;) \u662f\u5728\u670d\u52a1\u542f\u52a8\u65f6\u521d\u59cb\u5316\u7684&#xff0c;\u800c\u4e0d\u662f\u5728\u6bcf\u6b21\u8bf7\u6c42\u65f6\u3002\u8fd9\u662f\u751f\u4ea7\u73af\u5883\u7684\u5173\u952e&#xff0c;\u907f\u514d\u4e86\u91cd\u590d\u52a0\u8f7d\u6a21\u578b\u5e26\u6765\u7684\u5de8\u5927\u5ef6\u8fdf\u3002<\/li>\n<li>\u8c03\u5ea6\u5c42 (Scheduler)&#xff1a;\u5728\u201c[3. 
\u5f02\u6b65\u63a8\u7406\u9636\u6bb5]\u201d\u4e2d&#xff0c;\u6211\u8865\u5145\u4e86\u4ee3\u7801\u4e2d\u770b\u4e0d\u89c1\u4f46\u5728 vLLM \u5185\u90e8\u53d1\u751f\u7684\u201cContinuous Batching\u201d\u548c\u201cPagedAttention\u201d\u903b\u8f91&#xff0c;\u8fd9\u662f\u8be5\u670d\u52a1\u4e4b\u6240\u4ee5\u80fd\u9ad8\u5e76\u53d1\u7684\u6839\u672c\u539f\u56e0\u3002<\/li>\n<li>Prompt \u62fc\u63a5&#xff1a;\u660e\u786e\u5c55\u793a\u4e86\u5c06 JSON \u683c\u5f0f\u7684 messages \u8f6c\u6362\u4e3a\u6a21\u578b\u80fd\u7406\u89e3\u7684 ChatML \u5b57\u7b26\u4e32\u7684\u8fc7\u7a0b\u3002<\/li>\n<p>import os<br \/>\nimport uvicorn<br \/>\nfrom fastapi import FastAPI, Request, HTTPException<br \/>\nfrom fastapi.middleware.cors import CORSMiddleware<br \/>\nfrom pydantic import BaseModel<br \/>\nfrom vllm import LLM, SamplingParams<br \/>\nfrom typing import Optional, List<br \/>\nimport logging<br \/>\nfrom dotenv import load_dotenv<\/p>\n<p># \u52a0\u8f7d\u73af\u5883\u53d8\u91cf<br \/>\nload_dotenv()<\/p>\n<p># \u914d\u7f6e\u65e5\u5fd7<br \/>\nlogging.basicConfig(<br \/>\n    level&#061;logging.INFO,<br \/>\n    format&#061;&#034;%(asctime)s &#8211; %(levelname)s &#8211; %(message)s&#034;,<br \/>\n    handlers&#061;[logging.StreamHandler()]<br \/>\n)<br \/>\nlogger &#061; logging.getLogger(__name__)<\/p>\n<p># \u521d\u59cb\u5316FastAPI<br \/>\napp &#061; FastAPI(title&#061;&#034;LLM\u63a8\u7406\u52a0\u901f\u670d\u52a1&#034;, version&#061;&#034;1.0&#034;)<\/p>\n<p># \u914d\u7f6eCORS<br \/>\napp.add_middleware(<br \/>\n    CORSMiddleware,<br \/>\n    allow_origins&#061;[&#034;*&#034;],<br \/>\n    allow_credentials&#061;True,<br \/>\n    allow_methods&#061;[&#034;*&#034;],<br \/>\n    allow_headers&#061;[&#034;*&#034;],<br \/>\n)<\/p>\n<p># \u5168\u5c40\u914d\u7f6e<br \/>\nMODEL_PATH &#061; os.getenv(&#034;MODEL_PATH&#034;, &#034;.\/models\/Qwen2.5-7B-Instruct-GPTQ-4bit&#034;)<br \/>\nPORT &#061; int(os.getenv(&#034;PORT&#034;, 8000))<br \/>\nGPU_MEM_UTIL &#061; 
float(os.getenv(&#034;GPU_MEM_UTIL&#034;, 0.9))<\/p>\n<p># \u521d\u59cb\u5316vLLM&#xff08;\u96c6\u6210KV Cache &#043; \u91cf\u5316&#xff09;<br \/>\nlogger.info(f&#034;\u5f00\u59cb\u52a0\u8f7d\u6a21\u578b&#xff1a;{MODEL_PATH}&#034;)<br \/>\nllm &#061; LLM(<br \/>\n    model&#061;MODEL_PATH,<br \/>\n    tensor_parallel_size&#061;1,<br \/>\n    gpu_memory_utilization&#061;GPU_MEM_UTIL,<br \/>\n    max_num_batched_tokens&#061;4096,  # \u540c\u65f6\u51b3\u5b9achunked prefill\u7684token\u9884\u7b97<br \/>\n    max_num_seqs&#061;64,<br \/>\n    enable_chunked_prefill&#061;True,<br \/>\n    # \u91cf\u5316\u6a21\u578b\u914d\u7f6e<br \/>\n    quantization&#061;&#034;gptq&#034;,  # \u9002\u914dGPTQ\u91cf\u5316\u6a21\u578b<br \/>\n    trust_remote_code&#061;True,<br \/>\n)<br \/>\nsampling_params &#061; SamplingParams(<br \/>\n    temperature&#061;0.1,<br \/>\n    max_tokens&#061;2048,<br \/>\n    top_p&#061;0.95,<br \/>\n    use_beam_search&#061;False,<br \/>\n)<br \/>\nlogger.info(&#034;\u6a21\u578b\u52a0\u8f7d\u5b8c\u6210&#034;)<\/p>\n<p># \u8bf7\u6c42\u6a21\u578b<br \/>\nclass ChatRequest(BaseModel):<br \/>\n    messages: List[dict]  # [{&#034;role&#034;: &#034;user&#034;, &#034;content&#034;: &#034;&#8230;&#034;}]<br \/>\n    temperature: Optional[float] &#061; 0.1<br \/>\n    max_tokens: Optional[int] &#061; 2048<br \/>\n    session_id: Optional[str] &#061; &#034;default&#034;<\/p>\n<p># \u54cd\u5e94\u6a21\u578b<br \/>\nclass ChatResponse(BaseModel):<br \/>\n    code: int &#061; 200<br \/>\n    message: str &#061; &#034;success&#034;<br \/>\n    data: dict &#061; {}<\/p>\n<p># \u5065\u5eb7\u68c0\u67e5\u63a5\u53e3<br \/>\n&#064;app.get(&#034;\/health&#034;, response_model&#061;ChatResponse)<br \/>\nasync def health_check():<br \/>\n    return ChatResponse(<br \/>\n        data&#061;{<br \/>\n            &#034;status&#034;: &#034;healthy&#034;,<br \/>\n            &#034;model&#034;: MODEL_PATH,<br \/>\n            &#034;gpu_memory_utilization&#034;: GPU_MEM_UTIL<br \/>\n        }<br \/>\n    
)<\/p>\n<p># \u804a\u5929\u63a5\u53e3<br \/>\n&#064;app.post(&#034;\/chat&#034;, response_model&#061;ChatResponse)<br \/>\nasync def chat(request: ChatRequest):<br \/>\n    try:<br \/>\n        # \u9a8c\u8bc1\u8f93\u5165<br \/>\n        if not request.messages or not isinstance(request.messages, list):<br \/>\n            raise HTTPException(status_code&#061;400, detail&#061;&#034;messages\u4e0d\u80fd\u4e3a\u7a7a&#034;)<\/p>\n<p>        # \u6784\u9020prompt<br \/>\n        prompt &#061; &#034;&#034;<br \/>\n        for msg in request.messages:<br \/>\n            if msg[&#034;role&#034;] &#061;&#061; &#034;user&#034;:<br \/>\n                prompt &#043;&#061; f&#034;&lt;|im_start|&gt;user\\\\n{msg[&#039;content&#039;]}&lt;|im_end|&gt;\\\\n&#034;<br \/>\n            elif msg[&#034;role&#034;] &#061;&#061; &#034;assistant&#034;:<br \/>\n                prompt &#043;&#061; f&#034;&lt;|im_start|&gt;assistant\\\\n{msg[&#039;content&#039;]}&lt;|im_end|&gt;\\\\n&#034;<br \/>\n        prompt &#043;&#061; &#034;&lt;|im_start|&gt;assistant\\\\n&#034;<\/p>\n<p>        # \u8c03\u6574\u91c7\u6837\u53c2\u6570<br \/>\n        custom_sampling_params &#061; SamplingParams(<br \/>\n            temperature&#061;request.temperature,<br \/>\n            max_tokens&#061;request.max_tokens,<br \/>\n            top_p&#061;0.95,<br \/>\n        )<\/p>\n<p>        # \u63a8\u7406<br \/>\n        logger.info(f&#034;\u4f1a\u8bdd[{request.session_id}]\u5f00\u59cb\u63a8\u7406&#034;)<br \/>\n        outputs &#061; llm.generate([prompt], custom_sampling_params)<br \/>\n        logger.info(f&#034;\u4f1a\u8bdd[{request.session_id}]\u63a8\u7406\u5b8c\u6210&#034;)<\/p>\n<p>        # \u89e3\u6790\u8f93\u51fa<br \/>\n        output &#061; outputs[0].outputs[0].text<br \/>\n        prompt_tokens &#061; len(outputs[0].prompt_token_ids)<br \/>\n        completion_tokens &#061; len(outputs[0].outputs[0].token_ids)<\/p>\n<p>        # \u65e5\u5fd7<br \/>\n        
logger.info(<br \/>\n            f&#034;\u4f1a\u8bdd[{request.session_id}] &#8211; &#034;<br \/>\n            f&#034;\u8f93\u5165Token&#xff1a;{prompt_tokens}&#xff0c;\u8f93\u51faToken&#xff1a;{completion_tokens}&#034;<br \/>\n        )<\/p>\n<p>        return ChatResponse(<br \/>\n            data&#061;{<br \/>\n                &#034;response&#034;: output,<br \/>\n                &#034;prompt_tokens&#034;: prompt_tokens,<br \/>\n                &#034;completion_tokens&#034;: completion_tokens,<br \/>\n                &#034;session_id&#034;: request.session_id<br \/>\n            }<br \/>\n        )<br \/>\n    except HTTPException as e:<br \/>\n        logger.error(f&#034;\u5ba2\u6237\u7aef\u9519\u8bef&#xff1a;{e.detail}&#034;)<br \/>\n        return ChatResponse(<br \/>\n            code&#061;e.status_code,<br \/>\n            message&#061;e.detail,<br \/>\n            data&#061;{&#034;session_id&#034;: request.session_id}<br \/>\n        )<br \/>\n    except Exception as e:<br \/>\n        logger.error(f&#034;\u670d\u52a1\u7aef\u9519\u8bef&#xff1a;{str(e)}&#034;, exc_info&#061;True)<br \/>\n        return ChatResponse(<br \/>\n            code&#061;500,<br \/>\n            message&#061;f&#034;\u670d\u52a1\u7aef\u9519\u8bef&#xff1a;{str(e)}&#034;,<br \/>\n            data&#061;{&#034;session_id&#034;: request.session_id}<br \/>\n        )<\/p>\n<p># \u542f\u52a8\u670d\u52a1<br \/>\nif __name__ &#061;&#061; &#034;__main__&#034;:<br \/>\n    uvicorn.run(<br \/>\n        app&#061;&#034;inference_service:app&#034;,<br \/>\n        host&#061;&#034;0.0.0.0&#034;,<br \/>\n        port&#061;PORT,<br \/>\n        reload&#061;False,<br \/>\n        workers&#061;1<br \/>\n    )<\/p>\n<h4>6.3 \u90e8\u7f72\u542f\u52a8\u811a\u672c<\/h4>\n<p>#!\/bin\/bash<br \/>\n# start_service.sh<br \/>\nexport MODEL_PATH&#061;&#034;.\/models\/Qwen2.5-7B-Instruct-GPTQ-4bit&#034;<br \/>\nexport PORT&#061;8000<br \/>\nexport GPU_MEM_UTIL&#061;0.9<\/p>\n<p># \u542f\u52a8\u670d\u52a1<br \/>\npython 
inference_service.py<\/p>\n<h3>\u4e03\u3001\u5e38\u89c1\u95ee\u9898\u4e0e\u89e3\u51b3\u65b9\u6848<\/h3>\n<h4>7.1 KV Cache \u76f8\u5173\u95ee\u9898<\/h4>\n<table>\n<tr>\u95ee\u9898\u89e3\u51b3\u65b9\u6848<\/tr>\n<tbody>\n<tr>\n<td>\u663e\u5b58\u788e\u7247\u5bfc\u81f4OOM<\/td>\n<td>\u542f\u7528vLLM\u7684PagedAttention&#xff0c;\u8bbe\u7f6eswap_space<\/td>\n<\/tr>\n<tr>\n<td>\u957f\u6587\u672c\u5bf9\u8bdd\u901f\u5ea6\u4e0b\u964d<\/td>\n<td>\u589e\u5927chunked_prefill_tokens&#xff0c;\u8c03\u6574max_num_seqs<\/td>\n<\/tr>\n<tr>\n<td>KV Cache\u672a\u751f\u6548<\/td>\n<td>\u786e\u8ba4model.config.use_cache&#061;True&#xff0c;vLLM\u9ed8\u8ba4\u5f00\u542f<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>7.2 \u91cf\u5316\u76f8\u5173\u95ee\u9898<\/h4>\n<table>\n<tr>\u95ee\u9898\u89e3\u51b3\u65b9\u6848<\/tr>\n<tbody>\n<tr>\n<td>INT4\u91cf\u5316\u7cbe\u5ea6\u635f\u5931\u8fc7\u5927<\/td>\n<td>\u8c03\u6574group_size&#061;64&#xff0c;\u542f\u7528desc_act&#061;True&#xff0c;\u6216\u6539\u7528INT8<\/td>\n<\/tr>\n<tr>\n<td>GPTQ\u91cf\u5316\u6a21\u578b\u52a0\u8f7d\u5931\u8d25<\/td>\n<td>\u786e\u8ba4\u6a21\u578b\u6587\u4ef6\u5b8c\u6574&#xff0c;\u4f7f\u7528trust_remote_code&#061;True<\/td>\n<\/tr>\n<tr>\n<td>bitsandbytes\u91cf\u5316\u62a5\u9519<\/td>\n<td>\u5347\u7ea7bitsandbytes\u5230\u6700\u65b0\u7248&#xff0c;\u786e\u8ba4CUDA\u7248\u672c\u517c\u5bb9<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4>7.3 
\u84b8\u998f\u76f8\u5173\u95ee\u9898<\/h4>\n<table>\n<tr>\u95ee\u9898\u89e3\u51b3\u65b9\u6848<\/tr>\n<tbody>\n<tr>\n<td>\u84b8\u998f\u8bad\u7ec3\u8fc7\u6162<\/td>\n<td>\u542f\u7528gradient_checkpointing&#xff0c;\u589e\u5927batch_size<\/td>\n<\/tr>\n<tr>\n<td>\u84b8\u998f\u6a21\u578b\u7cbe\u5ea6\u4f4e<\/td>\n<td>\u589e\u52a0\u8bad\u7ec3\u6570\u636e\u91cf&#xff0c;\u8c03\u6574\u84b8\u998f\u635f\u5931\u6743\u91cd&#xff08;\u589e\u5927KL\u6563\u5ea6\u6743\u91cd&#xff09;<\/td>\n<\/tr>\n<tr>\n<td>LoRA\u8bad\u7ec3\u53c2\u6570\u8fc7\u591a<\/td>\n<td>\u51cf\u5c11target_modules&#xff0c;\u964d\u4f4elora_rank<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>\u516b\u3001\u603b\u7ed3\u4e0e\u672a\u6765\u8d8b\u52bf<\/h3>\n<h4>8.1 \u6838\u5fc3\u603b\u7ed3<\/h4>\n<li>\n<p>KV Cache \u662f\u65e0\u7cbe\u5ea6\u635f\u5931\u7684\u57fa\u7840\u52a0\u901f\u6280\u672f&#xff0c;\u4f18\u5148\u5728\u6240\u6709\u573a\u666f\u542f\u7528<\/p>\n<\/li>\n<li>\n<p>\u91cf\u5316\u6280\u672f \u662f\u663e\u5b58\u4f18\u5316\u7684\u9996\u9009&#xff0c;INT8\u5e73\u8861\u7cbe\u5ea6\u4e0e\u901f\u5ea6&#xff0c;INT4\u9002\u5408\u6781\u81f4\u8f7b\u91cf\u5316<\/p>\n<\/li>\n<li>\n<p>\u6a21\u578b\u84b8\u998f \u662f\u8fb9\u7f18\u90e8\u7f72\u7684\u6838\u5fc3\u65b9\u6848&#xff0c;\u7ed3\u5408\u91cf\u5316\u53ef\u5b9e\u73b010\u500d\u4ee5\u4e0a\u52a0\u901f<\/p>\n<\/li>\n<li>\n<p>\u7ec4\u5408\u4f18\u5316 \u662f\u751f\u4ea7\u73af\u5883\u7684\u6700\u4f73\u5b9e\u8df5&#xff1a;KV Cache &#043; \u91cf\u5316&#xff08;\u901a\u7528&#xff09;\u3001\u84b8\u998f &#043; \u91cf\u5316 &#043; KV Cache&#xff08;\u6781\u81f4&#xff09;<\/p>\n<\/li>\n<h4>8.2 \u672a\u6765\u8d8b\u52bf<\/h4>\n<li>\n<p>\u786c\u4ef6\u7ea7\u4f18\u5316&#xff1a;GPU\u5382\u5546\u4e13\u7528\u63a8\u7406\u82af\u7247&#xff08;\u5982NVIDIA H100&#xff09;\u5185\u7f6eKV 
Cache\u52a0\u901f<\/p>\n<\/li>\n<li>\n<p>\u6df7\u5408\u7cbe\u5ea6\u63a8\u7406&#xff1a;\u52a8\u6001\u8c03\u6574\u4e0d\u540c\u5c42\u7684\u91cf\u5316\u7cbe\u5ea6&#xff08;\u5982\u6ce8\u610f\u529b\u5c42INT8&#xff0c;\u8f93\u51fa\u5c42FP16&#xff09;<\/p>\n<\/li>\n<li>\n<p>\u589e\u91cf\u84b8\u998f&#xff1a;\u9488\u5bf9\u7279\u5b9a\u9886\u57df\u7684\u8f7b\u91cf\u5316\u84b8\u998f&#xff0c;\u4fdd\u7559\u9886\u57df\u77e5\u8bc6<\/p>\n<\/li>\n<li>\n<p>\u5206\u5e03\u5f0fKV Cache&#xff1a;\u591aGPU\/\u591a\u8282\u70b9\u5171\u4eabKV Cache&#xff0c;\u63d0\u5347\u5e76\u53d1\u80fd\u529b<\/p>\n<\/li>\n<h4>8.3 \u6700\u4f73\u5b9e\u8df5\u5efa\u8bae<\/h4>\n<li>\n<p>\u5feb\u901f\u9a8c\u8bc1&#xff1a;\u5148\u7528vLLM\u90e8\u7f727B\u6a21\u578b&#043;KV Cache&#xff0c;\u9a8c\u8bc1\u57fa\u7840\u6027\u80fd<\/p>\n<\/li>\n<li>\n<p>\u663e\u5b58\u4f18\u5316&#xff1a;\u542f\u7528INT8\u91cf\u5316&#xff0c;\u663e\u5b58\u5360\u7528\u964d\u4f4e50%<\/p>\n<\/li>\n<li>\n<p>\u8fb9\u7f18\u90e8\u7f72&#xff1a;\u84b8\u998f1.8B\u6a21\u578b&#043;INT4\u91cf\u5316&#xff0c;\u663e\u5b58&lt;3GB<\/p>\n<\/li>\n<li>\n<p>\u751f\u4ea7\u76d1\u63a7&#xff1a;\u96c6\u6210LangSmith\u76d1\u63a7\u63a8\u7406\u901f\u5ea6\u3001\u663e\u5b58\u5360\u7528\u3001\u7cbe\u5ea6\u635f\u5931<\/p>\n<\/li>\n<li>\n<p>\u6301\u7eed\u8c03\u4f18&#xff1a;\u6839\u636e\u4e1a\u52a1\u573a\u666f\u8c03\u6574\u91cf\u5316\u7cbe\u5ea6\u3001KV Cache\u53c2\u6570\u3001\u84b8\u998f\u635f\u5931\u6743\u91cd<\/p>\n<\/li>\n","protected":false},"excerpt":{"rendered":"<p>\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218&#xff1a;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff08;\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801&#xff09;<br \/>\n\u6587\u6863\u6982\u8ff0<br \/>\n\u6587\u7ae0\u6838\u5fc3\u4ef7\u503c \u6df1\u5ea6\u89e3\u6790\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u4e09\u5927\u6838\u5fc3\u6280\u672f&#xff08;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff09;\u7684\u5e95\u5c42\u539f\u7406  
\u57fa\u4e8e2025\u5e74\u6700\u65b0\u5f00\u6e90\u751f\u6001&#xff08;vLLM 0.5.0\u3001AutoGPTQ 0.7.1\u3001Transformers 4.41&#xff09;\u5b9e\u73b0\u4ee3\u7801\u843d\u5730  \u9488\u5bf9\u4e0d\u540c\u573a\u666f&#xff08;\u672c\u5730\u90e8\u7f72\u3001API\u670d\u52a1\u3001\u8fb9\u7f18\u8bbe\u5907<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[50,3684,51],"topic":[],"class_list":["post-70070","post","type-post","status-publish","format-standard","hentry","category-server","tag-50","tag-3684","tag-51"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/70070.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218&#xff1a;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff08;\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801&#xff09; \u6587\u6863\u6982\u8ff0 \u6587\u7ae0\u6838\u5fc3\u4ef7\u503c 
\u6df1\u5ea6\u89e3\u6790\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u4e09\u5927\u6838\u5fc3\u6280\u672f&#xff08;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff09;\u7684\u5e95\u5c42\u539f\u7406 \u57fa\u4e8e2025\u5e74\u6700\u65b0\u5f00\u6e90\u751f\u6001&#xff08;vLLM 0.5.0\u3001AutoGPTQ 0.7.1\u3001Transformers 4.41&#xff09;\u5b9e\u73b0\u4ee3\u7801\u843d\u5730 \u9488\u5bf9\u4e0d\u540c\u573a\u666f&#xff08;\u672c\u5730\u90e8\u7f72\u3001API\u670d\u52a1\u3001\u8fb9\u7f18\u8bbe\u5907\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/70070.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-02-01T06:17:07+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"28 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/70070.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/70070.html\",\"name\":\"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-02-01T06:17:07+00:00\",\"dateModified\":\"2026-02-01T06:17:07+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/70070.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/70070.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/70070.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required 
name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/70070.html","og_locale":"zh_CN","og_type":"article","og_title":"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218&#xff1a;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff08;\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801&#xff09; \u6587\u6863\u6982\u8ff0 \u6587\u7ae0\u6838\u5fc3\u4ef7\u503c \u6df1\u5ea6\u89e3\u6790\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u4e09\u5927\u6838\u5fc3\u6280\u672f&#xff08;KV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f&#xff09;\u7684\u5e95\u5c42\u539f\u7406 \u57fa\u4e8e2025\u5e74\u6700\u65b0\u5f00\u6e90\u751f\u6001&#xff08;vLLM 
0.5.0\u3001AutoGPTQ 0.7.1\u3001Transformers 4.41&#xff09;\u5b9e\u73b0\u4ee3\u7801\u843d\u5730 \u9488\u5bf9\u4e0d\u540c\u573a\u666f&#xff08;\u672c\u5730\u90e8\u7f72\u3001API\u670d\u52a1\u3001\u8fb9\u7f18\u8bbe\u5907","og_url":"https:\/\/www.wsisp.com\/helps\/70070.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-02-01T06:17:07+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"28 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/70070.html","url":"https:\/\/www.wsisp.com\/helps\/70070.html","name":"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-02-01T06:17:07+00:00","dateModified":"2026-02-01T06:17:07+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/70070.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/70070.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/70070.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u5927\u6a21\u578b\u63a8\u7406\u52a0\u901f\u6838\u5fc3\u6280\u672f\u5b9e\u6218\uff1aKV 
Cache\u3001\u91cf\u5316\u3001\u6a21\u578b\u84b8\u998f\uff08\u9644\u6700\u65b0\u5f00\u6e90\u4ee3\u7801\uff09"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/70070","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=70070"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/70070\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=70070"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?
post=70070"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=70070"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=70070"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}