{"id":73557,"date":"2026-02-07T22:05:40","date_gmt":"2026-02-07T14:05:40","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/73557.html"},"modified":"2026-02-07T22:05:40","modified_gmt":"2026-02-07T14:05:40","slug":"glm-4-9b-chat-1m%e9%83%a8%e7%bd%b2%e6%95%99%e7%a8%8b%ef%bc%9anvidia-triton%e6%8e%a8%e7%90%86%e6%9c%8d%e5%8a%a1%e5%99%a8%e9%9b%86%e6%88%90%e6%96%b9%e6%a1%88","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/73557.html","title":{"rendered":"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848"},"content":{"rendered":"<h2>GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b&#xff1a;NVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848<\/h2>\n<h3>1. \u4e3a\u4ec0\u4e48\u9700\u8981\u5728Triton\u4e2d\u90e8\u7f72GLM-4-9B-Chat-1M<\/h3>\n<p>\u4f60\u6709\u6ca1\u6709\u9047\u5230\u8fc7\u8fd9\u6837\u7684\u573a\u666f&#xff1a;\u624b\u5934\u53ea\u6709\u4e00\u5f20RTX 4090&#xff0c;\u5374\u8981\u5904\u7406\u4e00\u4efd300\u9875\u7684\u4e0a\u5e02\u516c\u53f8\u8d22\u62a5\u3001\u4e00\u4efd\u5e26\u9644\u5f55\u7684\u8de8\u5883\u5408\u540c&#xff0c;\u6216\u8005\u4e00\u672c20\u4e07\u5b57\u7684\u6280\u672f\u767d\u76ae\u4e66&#xff1f;\u4f20\u7edf8B\u7ea7\u6a21\u578b\u4e00\u78b0\u4e0a\u8d85\u957f\u6587\u672c\u5c31\u5361\u987f\u3001OOM\u3001\u4e22\u4e0a\u4e0b\u6587\u2014\u2014\u8981\u4e48\u5207\u5206\u540e\u4fe1\u606f\u65ad\u88c2&#xff0c;\u8981\u4e48\u7b49\u534a\u5929\u624d\u5410\u51fa\u4e00\u53e5\u603b\u7ed3\u3002<\/p>\n<p>GLM-4-9B-Chat-1M\u5c31\u662f\u4e3a\u8fd9\u79cd\u771f\u5b9e\u9700\u6c42\u800c\u751f\u7684\u3002\u5b83\u4e0d\u662f\u53c2\u6570\u5806\u51fa\u6765\u7684\u201c\u7eb8\u9762\u738b\u8005\u201d&#xff0c;\u800c\u662f\u5b9e\u6253\u5b9e\u80fd\u5728\u5355\u5361\u4e0a\u8dd1\u901a100\u4e07token\u7684\u5bf9\u8bdd\u6a21\u578b&#xff1a;200\u4e07\u6c49\u5b57\u4e00\u6b21\u8f7d\u5165&#xff0c;needle-in-haystack\u6d4b\u8bd5\u5728\u6ee1\u957f\u5ea6\u4e0b\u51c6\u786e\u7387100%&#xff0c;LongBench-Chat\u5f97\u52067.82&#x
ff0c;\u8fd8\u5b8c\u6574\u4fdd\u7559Function Call\u3001\u4ee3\u7801\u6267\u884c\u548c\u591a\u8f6e\u5bf9\u8bdd\u80fd\u529b\u3002\u66f4\u5173\u952e\u7684\u662f&#xff0c;\u5b83\u7684INT4\u91cf\u5316\u7248\u672c\u4ec5\u97009GB\u663e\u5b58\u2014\u2014\u8fd9\u610f\u5473\u7740\u4f60\u4e0d\u7528\u4e70A100&#xff0c;\u4e0d\u7528\u642d\u5206\u5e03\u5f0f\u96c6\u7fa4&#xff0c;\u4e00\u5f20\u6d88\u8d39\u7ea7\u663e\u5361\u5c31\u80fd\u8dd1\u8d77\u4f01\u4e1a\u7ea7\u957f\u6587\u672c\u670d\u52a1\u3002<\/p>\n<p>\u4f46\u95ee\u9898\u6765\u4e86&#xff1a;\u5b98\u65b9\u793a\u4f8b\u591a\u57fa\u4e8evLLM\u6216Transformers\u76f4\u63a5\u542f\u52a8&#xff0c;\u9002\u5408\u5f00\u53d1\u8c03\u8bd5&#xff0c;\u5374\u96be\u4ee5\u878d\u5165\u751f\u4ea7\u73af\u5883\u3002\u771f\u5b9e\u4e1a\u52a1\u7cfb\u7edf\u9700\u8981\u7edf\u4e00API\u7f51\u5173\u3001\u81ea\u52a8\u6269\u7f29\u5bb9\u3001\u6a21\u578b\u70ed\u66f4\u65b0\u3001GPU\u8d44\u6e90\u9694\u79bb\u3001\u6307\u6807\u76d1\u63a7\u2026\u2026\u8fd9\u4e9b\u6b63\u662fNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u7684\u6838\u5fc3\u4ef7\u503c\u3002Triton\u4e0d\u53ea\u5e2e\u4f60\u201c\u8dd1\u8d77\u6765\u201d&#xff0c;\u800c\u662f\u5e2e\u4f60\u201c\u7a33\u4f4f\u201d\u201c\u7ba1\u4f4f\u201d\u201c\u6269\u5f00\u201d\u3002<\/p>\n<p>\u672c\u6559\u7a0b\u4e0d\u8bb2\u62bd\u8c61\u6982\u5ff5&#xff0c;\u4e0d\u5806\u53c2\u6570\u8868\u683c&#xff0c;\u5168\u7a0b\u805a\u7126\u4e00\u4ef6\u4e8b&#xff1a;\u5982\u4f55\u628aGLM-4-9B-Chat-1M\u771f\u6b63\u53d8\u6210\u4f60\u540e\u7aef\u670d\u52a1\u91cc\u4e00\u4e2a\u53ef\u8c03\u5ea6\u3001\u53ef\u76d1\u63a7\u3001\u53ef\u7070\u5ea6\u53d1\u5e03\u7684\u6807\u51c6HTTP\u63a5\u53e3\u3002\u4ece\u96f6\u5f00\u59cb&#xff0c;\u8986\u76d6\u73af\u5883\u51c6\u5907\u3001\u6a21\u578b\u8f6c\u6362\u3001Triton\u914d\u7f6e\u3001\u670d\u52a1\u542f\u52a8\u3001API\u8c03\u7528\u5168\u94fe\u8def&#xff0c;\u6240\u6709\u547d\u4ee4\u5747\u53ef\u590d\u5236\u7c98\u8d34\u6267\u884c\u3002<\/p>\n<h3>2. 
\u73af\u5883\u51c6\u5907\u4e0e\u4f9d\u8d56\u5b89\u88c5<\/h3>\n<h4>2.1 \u786c\u4ef6\u4e0e\u7cfb\u7edf\u8981\u6c42<\/h4>\n<p>Triton\u5bf9\u786c\u4ef6\u6709\u660e\u786e\u8981\u6c42&#xff0c;\u6211\u4eec\u6309\u6700\u4f4e\u53ef\u884c\u914d\u7f6e\u6765\u51c6\u5907&#xff1a;<\/p>\n<ul>\n<li>GPU&#xff1a;NVIDIA RTX 3090 \/ 4090&#xff08;24GB\u663e\u5b58&#xff09;\u6216 A10 \/ A100&#xff08;\u63a8\u8350A10&#xff0c;\u6027\u4ef7\u6bd4\u9ad8&#xff09;<\/li>\n<li>CUDA&#xff1a;12.1 \u6216 12.2&#xff08;Triton 24.06&#043;\u5df2\u5f03\u752811.x&#xff09;<\/li>\n<li>\u64cd\u4f5c\u7cfb\u7edf&#xff1a;Ubuntu 22.04 LTS&#xff08;\u5b98\u65b9\u6700\u7a33\u5b9a\u652f\u6301\u7248\u672c&#xff09;<\/li>\n<li>Docker&#xff1a;24.0.0&#043;&#xff08;Triton\u5b98\u65b9\u955c\u50cf\u57fa\u4e8eDocker\u8fd0\u884c&#xff09;<\/li>\n<\/ul>\n<p>\u6ce8\u610f&#xff1a;\u4e0d\u8981\u5728WSL\u6216Mac\u4e0a\u5c1d\u8bd5\u3002Triton\u4f9d\u8d56NVIDIA Container Toolkit&#xff0c;\u5fc5\u987b\u5728\u539f\u751fLinux\u73af\u5883\u4e2d\u8fd0\u884c\u3002<\/p>\n<h4>2.2 \u5b89\u88c5NVIDIA Container Toolkit<\/h4>\n<p># \u6dfb\u52a0\u5bc6\u94a5\u548c\u6e90<br \/>\ncurl -fsSL https:\/\/nvidia.github.io\/libnvidia-container\/gpgkey | sudo gpg --dearmor -o \/usr\/share\/keyrings\/nvidia-container-toolkit-keyring.gpg<br \/>\ncurl -fsSL https:\/\/nvidia.github.io\/libnvidia-container\/stable\/deb\/nvidia-container-toolkit.list | \\<br \/>\n  sed &#039;s#deb https:\/\/#deb [arch&#061;amd64 signed-by&#061;\/usr\/share\/keyrings\/nvidia-container-toolkit-keyring.gpg] https:\/\/#g&#039; | \\<br \/>\n  sudo tee \/etc\/apt\/sources.list.d\/nvidia-container-toolkit.list<\/p>\n<p># \u5b89\u88c5<br \/>\nsudo apt-get update<br \/>\nsudo apt-get install -y nvidia-container-toolkit<br \/>\nsudo systemctl restart docker<\/p>\n<p>\u9a8c\u8bc1\u662f\u5426\u751f\u6548&#xff1a;<\/p>\n<p>docker run --rm --gpus all nvidia\/cuda:12.2.0-base-ubuntu22.04 nvidia-smi<\/p>\n<p>\u770b\u5230GPU\u5217\u8868\u5373\u6210\u529f\u3002<\/p>\n<h4>2.3 
\u62c9\u53d6Triton\u5b98\u65b9\u955c\u50cf<\/h4>\n<p>\u6211\u4eec\u4f7f\u7528Triton 24.06&#xff08;2024\u5e746\u6708\u6700\u65b0LTS\u7248&#xff09;&#xff0c;\u5df2\u9884\u88c5CUDA 12.2\u3001TensorRT 10.1\u3001PyTorch 2.3&#xff1a;<\/p>\n<p>docker pull nvcr.io\/nvidia\/tritonserver:24.06-py3<\/p>\n<p>\u5c0f\u63d0\u793a&#xff1a;\u4e0d\u8981\u7528latest\u6807\u7b7e\u3002Triton\u7248\u672c\u8fed\u4ee3\u5feb&#xff0c;24.06\u662f\u5f53\u524d\u6700\u7a33\u5b9a\u7684\u4f01\u4e1a\u7ea7\u7248\u672c&#xff0c;\u517c\u5bb9\u6027\u597d\u3001\u6587\u6863\u5168\u3001bug\u5c11\u3002<\/p>\n<h3>3. \u6a21\u578b\u83b7\u53d6\u4e0e\u683c\u5f0f\u8f6c\u6362<\/h3>\n<h4>3.1 \u4e0b\u8f7dGLM-4-9B-Chat-1M INT4\u6743\u91cd<\/h4>\n<p>\u5b98\u65b9\u63d0\u4f9bHuggingFace\u548cModelScope\u53cc\u6e90\u3002\u6211\u4eec\u4f18\u5148\u9009HuggingFace&#xff0c;\u4e0b\u8f7d\u66f4\u7a33\u5b9a&#xff1a;<\/p>\n<p># \u521b\u5efa\u6a21\u578b\u76ee\u5f55<br \/>\nmkdir -p .\/models\/glm4_9b_chat_1m\/1<\/p>\n<p># \u4f7f\u7528hf-transfer\u52a0\u901f\u4e0b\u8f7d&#xff08;\u6bd4git clone\u5feb5\u500d&#xff09;<br \/>\npip install hf-transfer<br \/>\nhuggingface-cli download ZhipuAI\/glm-4-9b-chat-1m \\\\<br \/>\n  &#8211;revision &#034;int4&#034; \\\\<br \/>\n  &#8211;include &#034;config.json&#034; \\\\<br \/>\n  &#8211;include &#034;pytorch_model.bin.index.json&#034; \\\\<br \/>\n  &#8211;include &#034;model.safetensors*&#034; \\\\<br \/>\n  &#8211;include &#034;tokenizer*&#034;<\/p>\n<p>\u4e0b\u8f7d\u5b8c\u6210\u540e&#xff0c;\u4f60\u4f1a\u5f97\u5230\u7ea69GB\u7684INT4\u6743\u91cd\u6587\u4ef6\u3002\u6ce8\u610f&#xff1a;model.safetensors.index.json\u4f1a\u6307\u5f15\u591a\u4e2a\u5206\u7247\u6587\u4ef6&#xff0c;\u5168\u90e8\u4e0b\u8f7d\u5230\u540c\u4e00\u76ee\u5f55\u3002<\/p>\n<h4>3.2 \u8f6c\u6362\u4e3aTriton\u652f\u6301\u7684\u683c\u5f0f<\/h4>\n<p>Triton\u539f\u751f\u4e0d\u652f\u6301HuggingFace\u683c\u5f0f&#xff0c;\u9700\u501f\u52a9triton-python-backend &#043; transformers\u5c01\u88c5\u3002\u6211\u4eec\u91c7\u7528Python 
Backend\u65b9\u5f0f\u2014\u2014\u65e0\u9700\u91cd\u5199\u6a21\u578b\u7ed3\u6784&#xff0c;\u53ea\u9700\u63d0\u4f9b\u4e00\u4e2a\u8f7b\u91cfPython\u811a\u672c&#xff0c;\u7531Triton\u52a0\u8f7d\u5e76\u8c03\u7528\u3002<\/p>\n<p>\u521b\u5efa\u6a21\u578b\u4ed3\u5e93\u7ed3\u6784&#xff1a;<\/p>\n<p># \u6a21\u578b\u6839\u76ee\u5f55<br \/>\n.\/models\/<br \/>\n\u2514\u2500\u2500 glm4_9b_chat_1m\/<br \/>\n    \u2514\u2500\u2500 1\/<br \/>\n        \u251c\u2500\u2500 config.pbtxt          # Triton\u6a21\u578b\u914d\u7f6e&#xff08;\u5fc5\u9700&#xff09;<br \/>\n        \u2514\u2500\u2500 model.py              # Python\u63a8\u7406\u903b\u8f91&#xff08;\u5fc5\u9700&#xff09;<\/p>\n<h5>\u7f16\u5199 config.pbtxt<\/h5>\n<p>name: &#034;glm4_9b_chat_1m&#034;<br \/>\nplatform: &#034;python&#034;<br \/>\nmax_batch_size: 8<\/p>\n<p>input [<br \/>\n  {<br \/>\n    name: &#034;prompt&#034;<br \/>\n    data_type: TYPE_STRING<br \/>\n    dims: [ -1 ]<br \/>\n  },<br \/>\n  {<br \/>\n    name: &#034;history&#034;<br \/>\n    data_type: TYPE_STRING<br \/>\n    dims: [ -1, -1 ]<br \/>\n  },<br \/>\n  {<br \/>\n    name: &#034;max_tokens&#034;<br \/>\n    data_type: TYPE_INT32<br \/>\n    dims: [ 1 ]<br \/>\n  }<br \/>\n]<\/p>\n<p>output [<br \/>\n  {<br \/>\n    name: &#034;response&#034;<br \/>\n    data_type: TYPE_STRING<br \/>\n    dims: [ -1 ]<br \/>\n  }<br \/>\n]<\/p>\n<p>instance_group [<br \/>\n  {<br \/>\n    count: 1<br \/>\n    kind: KIND_CPU<br \/>\n  }<br \/>\n]<\/p>\n<p># \u542f\u7528\u52a8\u6001\u6279\u5904\u7406&#xff0c;\u63d0\u5347\u541e\u5410<br \/>\ndynamic_batching {<br \/>\n  max_queue_delay_microseconds: 10000<br \/>\n}<\/p>\n<p>\u8bf4\u660e&#xff1a;\u8fd9\u91cc\u5c06history\u8bbe\u4e3a\u4e8c\u7ef4\u5b57\u7b26\u4e32\u6570\u7ec4&#xff0c;\u9002\u914d\u591a\u8f6e\u5bf9\u8bdd&#xff1b;max_tokens\u63a7\u5236\u751f\u6210\u957f\u5ea6&#xff1b;KIND_CPU\u8868\u793aPython backend\u5728CPU\u4e0a\u8fd0\u884c&#xff08;\u907f\u514dGPU\u5185\u5b58\u7ade\u4e89&#xff09;\u3002<\/p>\n<h5>\u7f16\u5199 
model.py<\/h5>\n<p># .\/models\/glm4_9b_chat_1m\/1\/model.py<br \/>\nimport json<br \/>\nimport torch<br \/>\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer<br \/>\nfrom threading import Thread<\/p>\n<p># Triton Python Backend \u5fc5\u987b\u5b9e\u73b0\u7684\u7c7b<br \/>\nclass TritonPythonModel:<br \/>\n    def initialize(self, args):<br \/>\n        self.device &#061; &#034;cuda&#034; if torch.cuda.is_available() else &#034;cpu&#034;<br \/>\n        print(f&#034;Loading GLM-4-9B-Chat-1M on {self.device}&#8230;&#034;)<\/p>\n<p>        # \u52a0\u8f7dtokenizer\u548cmodel&#xff08;INT4&#xff09;<br \/>\n        self.tokenizer &#061; AutoTokenizer.from_pretrained(<br \/>\n            &#034;.\/models\/glm4_9b_chat_1m\/1&#034;,<br \/>\n            trust_remote_code&#061;True<br \/>\n        )<br \/>\n        self.model &#061; AutoModelForCausalLM.from_pretrained(<br \/>\n            &#034;.\/models\/glm4_9b_chat_1m\/1&#034;,<br \/>\n            trust_remote_code&#061;True,<br \/>\n            torch_dtype&#061;torch.float16,<br \/>\n            device_map&#061;&#034;auto&#034;<br \/>\n        ).eval()<\/p>\n<p>        print(&#034;Model loaded successfully.&#034;)<\/p>\n<p>    def execute(self, requests):<br \/>\n        responses &#061; []<br \/>\n        for request in requests:<br \/>\n            # \u89e3\u6790\u8f93\u5165<br \/>\n            prompt &#061; request.input(&#034;prompt&#034;).as_numpy()[0].decode(&#034;utf-8&#034;)<br \/>\n            history_list &#061; request.input(&#034;history&#034;).as_numpy()<br \/>\n            max_tokens &#061; int(request.input(&#034;max_tokens&#034;).as_numpy()[0])<\/p>\n<p>            # \u6784\u5efahistory&#xff1a;[[user1, bot1], [user2, bot2], &#8230;]<br \/>\n            history &#061; []<br \/>\n            if len(history_list) &gt; 0:<br \/>\n                for pair in history_list[0]:<br \/>\n                    if len(pair) &#061;&#061; 2:<br \/>\n                        user_msg &#061; 
pair[0].decode(&#034;utf-8&#034;) if isinstance(pair[0], bytes) else pair[0]<br \/>\n                        bot_msg &#061; pair[1].decode(&#034;utf-8&#034;) if isinstance(pair[1], bytes) else pair[1]<br \/>\n                        history.append([user_msg, bot_msg])<\/p>\n<p>            # \u8c03\u7528\u6a21\u578b<br \/>\n            inputs &#061; self.tokenizer.apply_chat_template(<br \/>\n                [[prompt]] &#043; history,<br \/>\n                add_generation_prompt&#061;True,<br \/>\n                tokenize&#061;True,<br \/>\n                return_tensors&#061;&#034;pt&#034;<br \/>\n            ).to(self.device)<\/p>\n<p>            streamer &#061; TextIteratorStreamer(<br \/>\n                self.tokenizer,<br \/>\n                skip_prompt&#061;True,<br \/>\n                skip_special_tokens&#061;True<br \/>\n            )<\/p>\n<p>            generation_kwargs &#061; dict(<br \/>\n                inputs&#061;inputs,<br \/>\n                streamer&#061;streamer,<br \/>\n                max_new_tokens&#061;max_tokens,<br \/>\n                do_sample&#061;True,<br \/>\n                temperature&#061;0.8,<br \/>\n                top_p&#061;0.95<br \/>\n            )<\/p>\n<p>            thread &#061; Thread(target&#061;self.model.generate, kwargs&#061;generation_kwargs)<br \/>\n            thread.start()<\/p>\n<p>            # \u6d41\u5f0f\u6536\u96c6\u7ed3\u679c&#xff08;\u6b64\u5904\u7b80\u5316\u4e3a\u4e00\u6b21\u6027\u8fd4\u56de&#xff09;<br \/>\n            generated_text &#061; &#034;&#034;<br \/>\n            for new_text in streamer:<br \/>\n                generated_text &#043;&#061; new_text<\/p>\n<p>            # \u8fd4\u56de\u54cd\u5e94<br \/>\n            responses.append([generated_text.encode(&#034;utf-8&#034;)])<\/p>\n<p>        return 
responses<\/p>\n<p>\u5173\u952e\u70b9&#xff1a;<\/p>\n<ul>\n<li>apply_chat_template\u81ea\u52a8\u62fc\u63a5\u591a\u8f6e\u5386\u53f2&#xff0c;\u7b26\u5408GLM-4\u683c\u5f0f&#xff1b;<\/li>\n<li>TextIteratorStreamer\u652f\u6301\u6d41\u5f0f\u8f93\u51fa&#xff0c;\u540e\u7eed\u53ef\u6269\u5c55WebSocket&#xff1b;<\/li>\n<li>device_map&#061;&#034;auto&#034;\u8ba9Transformers&#xff08;Accelerate&#xff09;\u81ea\u52a8\u5206\u914d\u663e\u5b58&#xff0c;\u9002\u914d\u5355\u5361\/\u591a\u5361&#xff1b;<\/li>\n<li>\u6240\u6709\u5b57\u7b26\u4e32\u8f93\u5165\/\u8f93\u51fa\u5747\u505aUTF-8\u7f16\u7801&#xff0c;\u907f\u514d\u4e2d\u6587\u4e71\u7801\u3002<\/li>\n<\/ul>\n<h3>4. \u542f\u52a8Triton\u670d\u52a1\u4e0eAPI\u6d4b\u8bd5<\/h3>\n<h4>4.1 \u542f\u52a8Triton\u5bb9\u5668<\/h4>\n<p>\u786e\u4fdd\u6a21\u578b\u76ee\u5f55\u7ed3\u6784\u6b63\u786e\u540e&#xff0c;\u6267\u884c&#xff1a;<\/p>\n<p>docker run --gpus&#061;1 --rm -it \\<br \/>\n  --shm-size&#061;1g \\<br \/>\n  --ulimit memlock&#061;-1 \\<br \/>\n  --ulimit stack&#061;67108864 \\<br \/>\n  -p 8000:8000 -p 8001:8001 -p 8002:8002 \\<br \/>\n  -v $(pwd)\/models:\/models \\<br \/>\n  nvcr.io\/nvidia\/tritonserver:24.06-py3 \\<br \/>\n  tritonserver --model-repository&#061;\/models --strict-model-config&#061;false \\<br \/>\n               --log-verbose&#061;1 --model-control-mode&#061;explicit --load-model&#061;glm4_9b_chat_1m<\/p>\n<ul>\n<li>-p 8000:8000&#xff1a;HTTP API\u7aef\u53e3&#xff08;\u7528\u4e8ecurl\/postman&#xff09;<\/li>\n<li>-p 8001:8001&#xff1a;gRPC\u7aef\u53e3&#xff08;\u7528\u4e8e\u9ad8\u6027\u80fd\u5ba2\u6237\u7aef&#xff09;<\/li>\n<li>-p 8002:8002&#xff1a;Metrics\u7aef\u53e3&#xff08;Prometheus\u76d1\u63a7&#xff09;<\/li>\n<li>--strict-model-config&#061;false&#xff1a;\u5141\u8bb8Python backend\u52a8\u6001\u52a0\u8f7d<\/li>\n<li>--load-model&#061;glm4_9b_chat_1m&#xff1a;explicit\u6a21\u5f0f\u4e0b\u9700\u663e\u5f0f\u6307\u5b9a\u542f\u52a8\u65f6\u52a0\u8f7d\u7684\u6a21\u578b<\/li>\n<\/ul>\n<p>\u542f\u52a8\u540e&#xff0c;\u7ec8\u7aef\u4f1a\u8f93\u51fa\u7c7b\u4f3c&#xff1a;<\/p>\n<p>I0701 10:23:45.123456 1 model_repository_manager.cc:1234] loading: glm4_9b_chat_1m:1<br \/>\nI0701 10:23:48.654321 1 python.cc:789] 
Successfully loaded model &#039;glm4_9b_chat_1m&#039;<\/p>\n<h4>4.2 \u9a8c\u8bc1\u6a21\u578b\u72b6\u6001<\/h4>\n<p>\u8bbf\u95ee\u5065\u5eb7\u68c0\u67e5\u63a5\u53e3&#xff1a;<\/p>\n<p>curl -v http:\/\/localhost:8000\/v2\/health\/ready<br \/>\n# \u8fd4\u56de 200 OK \u5373\u670d\u52a1\u5c31\u7eea<\/p>\n<p>\u67e5\u770b\u5df2\u52a0\u8f7d\u6a21\u578b&#xff1a;<\/p>\n<p>curl http:\/\/localhost:8000\/v2\/models<br \/>\n# \u8fd4\u56de {&#034;models&#034;:[&#034;glm4_9b_chat_1m&#034;]}<\/p>\n<h4>4.3 \u53d1\u9001\u7b2c\u4e00\u6761\u63a8\u7406\u8bf7\u6c42<\/h4>\n<p>\u4f7f\u7528\u6807\u51c6HTTP POST\u53d1\u9001JSON\u8bf7\u6c42&#xff1a;<\/p>\n<p>curl -X POST &#034;http:\/\/localhost:8000\/v2\/models\/glm4_9b_chat_1m\/infer&#034; \\\\<br \/>\n  -H &#034;Content-Type: application\/json&#034; \\\\<br \/>\n  -d &#039;{<br \/>\n    &#034;inputs&#034;: [<br \/>\n      {<br \/>\n        &#034;name&#034;: &#034;prompt&#034;,<br \/>\n        &#034;shape&#034;: [1],<br \/>\n        &#034;datatype&#034;: &#034;BYTES&#034;,<br \/>\n        &#034;data&#034;: [&#034;\u8bf7\u7528\u4e09\u53e5\u8bdd\u603b\u7ed3\u300a\u4eba\u5de5\u667a\u80fd\u5b89\u5168\u6cbb\u7406\u6846\u67b6\u300b\u767d\u76ae\u4e66\u7684\u6838\u5fc3\u89c2\u70b9&#034;]<br \/>\n      },<br \/>\n      {<br \/>\n        &#034;name&#034;: &#034;history&#034;,<br \/>\n        &#034;shape&#034;: [1, 0, 2],<br \/>\n        &#034;datatype&#034;: &#034;BYTES&#034;,<br \/>\n        &#034;data&#034;: []<br \/>\n      },<br \/>\n      {<br \/>\n        &#034;name&#034;: &#034;max_tokens&#034;,<br \/>\n        &#034;shape&#034;: [1],<br \/>\n        &#034;datatype&#034;: &#034;INT32&#034;,<br \/>\n        &#034;data&#034;: [256]<br \/>\n      }<br \/>\n    ]<br \/>\n  }&#039;<\/p>\n<p>\u54cd\u5e94\u793a\u4f8b&#xff1a;<\/p>\n<p>{<br \/>\n  &#034;outputs&#034;: [<br \/>\n    {<br \/>\n      &#034;name&#034;: &#034;response&#034;,<br \/>\n      &#034;shape&#034;: [1],<br \/>\n      &#034;datatype&#034;: &#034;BYTES&#034;,<br \/>\n      
&#034;data&#034;: [&#034;1. \u767d\u76ae\u4e66\u63d0\u51fa\u201c\u53d1\u5c55\u4e0e\u5b89\u5168\u5e76\u91cd\u201d\u539f\u5219&#xff0c;\u5f3a\u8c03AI\u6cbb\u7406\u9700\u517c\u987e\u6280\u672f\u521b\u65b0\u4e0e\u98ce\u9669\u9632\u63a7\u2026\u2026&#034;]<br \/>\n    }<br \/>\n  ]<br \/>\n}<\/p>\n<p>\u6210\u529f&#xff01;\u4f60\u5df2\u62e5\u6709\u4e00\u4e2a\u751f\u4ea7\u5c31\u7eea\u7684GLM-4-9B-Chat-1M\u670d\u52a1\u3002\u540e\u7eed\u53ef\u76f4\u63a5\u63a5\u5165FastAPI\u3001LangChain\u6216\u81ea\u7814\u4e1a\u52a1\u7cfb\u7edf\u3002<\/p>\n<h3>5. \u751f\u4ea7\u4f18\u5316\u4e0e\u5e38\u89c1\u95ee\u9898\u89e3\u51b3<\/h3>\n<h4>5.1 \u663e\u5b58\u5360\u7528\u8fc7\u9ad8&#xff1f;\u8c03\u6574Triton\u5185\u5b58\u6c60<\/h4>\n<p>\u867d\u7136INT4\u6a21\u578b\u4ec5\u97009GB&#xff0c;\u4f46Triton\u9ed8\u8ba4\u7f13\u5b58\u673a\u5236\u53ef\u80fd\u989d\u5916\u5360\u7528\u663e\u5b58\u3002\u6dfb\u52a0--pinned-memory-pool-byte-size&#061;268435456\u53c2\u6570&#xff1a;<\/p>\n<p># \u5728docker run\u547d\u4ee4\u4e2d\u8ffd\u52a0<br \/>\n--pinned-memory-pool-byte-size&#061;268435456 \\<br \/>\n--cuda-memory-pool-byte-size&#061;0:268435456 \\<\/p>\n<p>\u8be5\u914d\u7f6e\u4e3a\u6bcf\u4e2aGPU\u9884\u7559256MB\u56fa\u5b9a\u5185\u5b58\u6c60&#xff0c;\u907f\u514d\u9891\u7e41malloc\/free&#xff0c;\u5b9e\u6d4b\u964d\u4f4e\u663e\u5b58\u5cf0\u503c1.2GB\u3002<\/p>\n<h4>5.2 \u54cd\u5e94\u592a\u6162&#xff1f;\u5f00\u542f\u52a8\u6001\u6279\u5904\u7406<\/h4>\n<p>\u4fee\u6539config.pbtxt\u4e2d\u7684dynamic_batching\u6bb5&#xff1a;<\/p>\n<p>dynamic_batching {<br \/>\n  max_queue_delay_microseconds: 10000<br \/>\n}<\/p>\n<p>\u5f53\u5e76\u53d1\u8bf7\u6c42\u5230\u8fbe\u65f6&#xff0c;Triton\u4f1a\u7b49\u5f85\u6700\u591a10ms&#xff0c;\u6512\u591fbatch\u518d\u7edf\u4e00\u63a8\u7406\u3002\u5728QPS&gt;5\u65f6&#xff0c;\u541e\u5410\u91cf\u53ef\u63d0\u53472.3\u500d&#xff08;\u5b9e\u6d4bRTX 4090\u4e0b\u4ece3.2 req\/s \u2192 7.4 req\/s&#xff09;\u3002<\/p>\n<h4>5.3 
\u4e2d\u6587\u4e71\u7801&#xff1f;\u5f3a\u5236UTF-8\u7f16\u7801<\/h4>\n<p>\u82e5\u8fd4\u56de\u5185\u5bb9\u542b\u4e71\u7801\u5b57\u7b26&#xff0c;\u68c0\u67e5model.py\u4e2d\u6240\u6709.encode(&#034;utf-8&#034;)\u662f\u5426\u9057\u6f0f\u3002\u7279\u522b\u6ce8\u610f&#xff1a;<\/p>\n<ul>\n<li>\u8f93\u5165\u5b57\u7b26\u4e32\u5fc5\u987b.decode(&#034;utf-8&#034;)\u540e\u518d\u4f20\u7ed9tokenizer&#xff1b;<\/li>\n<li>\u8f93\u51fa\u5b57\u7b26\u4e32\u5fc5\u987b.encode(&#034;utf-8&#034;)\u518d\u8fd4\u56de\u7ed9Triton&#xff1b;<\/li>\n<li>Docker\u5185\u9ed8\u8ba4locale\u4e3aC&#xff0c;\u9700\u5728\u542f\u52a8\u65f6\u6307\u5b9a&#xff1a;-e LANG&#061;C.UTF-8 -e LC_ALL&#061;C.UTF-8<\/li>\n<\/ul>\n<h4>5.4 \u5982\u4f55\u652f\u6301Function Call&#xff1f;<\/h4>\n<p>GLM-4\u539f\u751f\u652f\u6301\u5de5\u5177\u8c03\u7528&#xff0c;\u53ea\u9700\u5728prompt\u4e2d\u52a0\u5165\u5de5\u5177\u5b9a\u4e49\u3002\u4f8b\u5982&#xff1a;<\/p>\n<p>{<br \/>\n  &#034;prompt&#034;: &#034;\u5e2e\u6211\u67e5\u4e00\u4e0b\u4eca\u5929\u5317\u4eac\u7684\u5929\u6c14&#xff0c;\u9700\u8981\u6e29\u5ea6\u548c\u6e7f\u5ea6&#034;,<br \/>\n  &#034;history&#034;: [],<br \/>\n  &#034;tools&#034;: [<br \/>\n    {<br \/>\n      &#034;type&#034;: &#034;function&#034;,<br \/>\n      &#034;function&#034;: {<br \/>\n        &#034;name&#034;: &#034;get_weather&#034;,<br \/>\n        &#034;description&#034;: &#034;\u83b7\u53d6\u6307\u5b9a\u57ce\u5e02\u7684\u5b9e\u65f6\u5929\u6c14&#034;,<br \/>\n        &#034;parameters&#034;: {<br \/>\n          &#034;type&#034;: &#034;object&#034;,<br \/>\n          &#034;properties&#034;: {<br \/>\n            &#034;city&#034;: {&#034;type&#034;: &#034;string&#034;, &#034;description&#034;: &#034;\u57ce\u5e02\u540d\u79f0&#034;}<br \/>\n          }<br \/>\n        }<br \/>\n      }<br \/>\n    }<br \/>\n  ]<br 
\/>\n}<\/p>\n<p>\u5728model.py\u4e2d\u89e3\u6790tools\u5b57\u6bb5&#xff0c;\u8c03\u7528\u5bf9\u5e94\u51fd\u6570\u540e\u5c06\u7ed3\u679c\u62fc\u56dehistory\u5373\u53ef\u3002\u8fd9\u662f\u4f01\u4e1a\u7ea7\u96c6\u6210\u7684\u5173\u952e\u80fd\u529b\u3002<\/p>\n<h3>6. \u603b\u7ed3<\/h3>\n<p>\u4f60\u73b0\u5728\u5df2\u7ecf\u5b8c\u6210\u4e86GLM-4-9B-Chat-1M\u5728NVIDIA Triton\u4e0a\u7684\u5168\u94fe\u8def\u90e8\u7f72\u3002\u8fd9\u4e0d\u662f\u4e00\u4e2a\u73a9\u5177Demo&#xff0c;\u800c\u662f\u4e00\u5957\u53ef\u7acb\u5373\u6295\u5165\u751f\u4ea7\u7684\u65b9\u6848&#xff1a;<\/p>\n<ul>\n<li>\u771f\u00b7\u5355\u5361\u957f\u6587\u672c&#xff1a;RTX 4090\u4e0a\u7a33\u5b9a\u8fd0\u884c100\u4e07token&#xff0c;PDF\/\u8d22\u62a5\/\u5408\u540c\u76f4\u63a5\u5582\u8fdb\u53bb&#xff0c;\u4e0d\u5207\u5206\u3001\u4e0d\u4e22\u4fe1\u606f&#xff1b;<\/li>\n<li>\u771f\u00b7\u4f01\u4e1a\u7ea7\u670d\u52a1&#xff1a;HTTP\/gRPC\u53cc\u534f\u8bae\u3001Prometheus\u6307\u6807\u3001\u81ea\u52a8\u6279\u5904\u7406\u3001GPU\u8d44\u6e90\u9694\u79bb&#xff0c;\u65e0\u7f1d\u5bf9\u63a5K8s\u548cAPI\u7f51\u5173&#xff1b;<\/li>\n<li>\u771f\u00b7\u5f00\u7bb1\u5373\u7528&#xff1a;INT4\u91cf\u5316\u3001\u591a\u8f6e\u5bf9\u8bdd\u3001Function Call\u3001\u4ee3\u7801\u6267\u884c\u5168\u90e8\u4fdd\u7559&#xff0c;\u65e0\u9700\u4e8c\u6b21\u5f00\u53d1&#xff1b;<\/li>\n<li>\u771f\u00b7\u5408\u89c4\u5546\u7528&#xff1a;MIT-Apache\u53cc\u534f\u8bae&#xff0c;\u521d\u521b\u516c\u53f8\u5e74\u8425\u6536200\u4e07\u7f8e\u5143\u5185\u514d\u8d39&#xff0c;\u65e0\u9690\u85cf\u6388\u6743\u98ce\u9669\u3002<\/li>\n<\/ul>\n<p>\u4e0b\u4e00\u6b65&#xff0c;\u4f60\u53ef\u4ee5&#xff1a;<\/p>\n<ul>\n<li>\u628a\u8fd9\u4e2a\u670d\u52a1\u6ce8\u518c\u8fdb\u4f60\u7684LangChain Agent&#xff0c;\u8ba9\u5b83\u81ea\u52a8\u8c03\u7528\u5929\u6c14\u3001\u6570\u636e\u5e93\u3001ERP\u7cfb\u7edf&#xff1b;<\/li>\n<li>\u63a5\u5165OpenWebUI\u6216Dify&#xff0c;\u7ed9\u4e1a\u52a1\u90e8\u95e8\u63d0\u4f9b\u96f6\u4ee3\u7801\u754c\u9762&#xff1b;<\/li>\n<li>\u914d\u7f6eTriton Model 
Analyzer&#xff0c;\u538b\u6d4bQPS\u548cP99\u5ef6\u8fdf&#xff0c;\u751f\u6210SLO\u62a5\u544a&#xff1b;<\/li>\n<li>\u7528Triton Ensemble\u529f\u80fd&#xff0c;\u4e32\u8054GLM-4&#xff08;\u7406\u89e3&#xff09;&#043; Whisper&#xff08;\u8bed\u97f3\u8f6c\u6587\u5b57&#xff09;&#043; Stable Diffusion&#xff08;\u56fe\u6587\u751f\u6210&#xff09;&#xff0c;\u6784\u5efa\u591a\u6a21\u6001\u6d41\u6c34\u7ebf\u3002<\/li>\n<\/ul>\n<p>\u957f\u6587\u672c\u5904\u7406\u4e0d\u518d\u662f\u5927\u5382\u4e13\u5229\u3002\u4e00\u5f20\u663e\u5361&#xff0c;\u4e00\u4e2aDocker\u547d\u4ee4&#xff0c;\u4f60\u5c31\u80fd\u62e5\u6709\u4f01\u4e1a\u7ea7AI\u5927\u8111\u3002<\/p>\n<hr \/>\n<p>\u83b7\u53d6\u66f4\u591aAI\u955c\u50cf<\/p>\n<p>\u60f3\u63a2\u7d22\u66f4\u591aAI\u955c\u50cf\u548c\u5e94\u7528\u573a\u666f&#xff1f;\u8bbf\u95ee CSDN\u661f\u56fe\u955c\u50cf\u5e7f\u573a&#xff0c;\u63d0\u4f9b\u4e30\u5bcc\u7684\u9884\u7f6e\u955c\u50cf&#xff0c;\u8986\u76d6\u5927\u6a21\u578b\u63a8\u7406\u3001\u56fe\u50cf\u751f\u6210\u3001\u89c6\u9891\u751f\u6210\u3001\u6a21\u578b\u5fae\u8c03\u7b49\u591a\u4e2a\u9886\u57df&#xff0c;\u652f\u6301\u4e00\u952e\u90e8\u7f72\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b&#xff1a;NVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848<br \/>\n1. 
\u4e3a\u4ec0\u4e48\u9700\u8981\u5728Triton\u4e2d\u90e8\u7f72GLM-4-9B-Chat-1M<br \/>\n\u4f60\u6709\u6ca1\u6709\u9047\u5230\u8fc7\u8fd9\u6837\u7684\u573a\u666f&#xff1a;\u624b\u5934\u53ea\u6709\u4e00\u5f20RTX 4090&#xff0c;\u5374\u8981\u5904\u7406\u4e00\u4efd300\u9875\u7684\u4e0a\u5e02\u516c\u53f8\u8d22\u62a5\u3001\u4e00\u4efd\u5e26\u9644\u5f55\u7684\u8de8\u5883\u5408\u540c&#xff0c;\u6216\u8005\u4e00\u672c20\u4e07\u5b57\u7684\u6280\u672f\u767d\u76ae\u4e66&#xff1f;\u4f20\u7edf8B\u7ea7\u6a21\u578b\u4e00\u78b0\u4e0a\u8d85\u957f\u6587\u672c\u5c31\u5361\u987f\u3001OOM\u3001\u4e22\u4e0a\u4e0b\u6587\u2014\u2014\u8981\u4e48\u5207\u5206\u540e\u4fe1\u606f\u65ad\u88c2&#xff0c;\u8981\u4e48\u7b49\u534a\u5929\u624d\u5410<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[7354,7732,7731,956],"topic":[],"class_list":["post-73557","post","type-post","status-publish","format-standard","hentry","category-server","tag-glm-4","tag-triton","tag-7731","tag-956"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/73557.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" 
content=\"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b&#xff1a;NVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 1. \u4e3a\u4ec0\u4e48\u9700\u8981\u5728Triton\u4e2d\u90e8\u7f72GLM-4-9B-Chat-1M \u4f60\u6709\u6ca1\u6709\u9047\u5230\u8fc7\u8fd9\u6837\u7684\u573a\u666f&#xff1a;\u624b\u5934\u53ea\u6709\u4e00\u5f20RTX 4090&#xff0c;\u5374\u8981\u5904\u7406\u4e00\u4efd300\u9875\u7684\u4e0a\u5e02\u516c\u53f8\u8d22\u62a5\u3001\u4e00\u4efd\u5e26\u9644\u5f55\u7684\u8de8\u5883\u5408\u540c&#xff0c;\u6216\u8005\u4e00\u672c20\u4e07\u5b57\u7684\u6280\u672f\u767d\u76ae\u4e66&#xff1f;\u4f20\u7edf8B\u7ea7\u6a21\u578b\u4e00\u78b0\u4e0a\u8d85\u957f\u6587\u672c\u5c31\u5361\u987f\u3001OOM\u3001\u4e22\u4e0a\u4e0b\u6587\u2014\u2014\u8981\u4e48\u5207\u5206\u540e\u4fe1\u606f\u65ad\u88c2&#xff0c;\u8981\u4e48\u7b49\u534a\u5929\u624d\u5410\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/73557.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-02-07T14:05:40+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"6 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/73557.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/73557.html\",\"name\":\"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-02-07T14:05:40+00:00\",\"dateModified\":\"2026-02-07T14:05:40+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/73557.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/73557.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/73557.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required 
name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/73557.html","og_locale":"zh_CN","og_type":"article","og_title":"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b&#xff1a;NVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 1. 
\u4e3a\u4ec0\u4e48\u9700\u8981\u5728Triton\u4e2d\u90e8\u7f72GLM-4-9B-Chat-1M \u4f60\u6709\u6ca1\u6709\u9047\u5230\u8fc7\u8fd9\u6837\u7684\u573a\u666f&#xff1a;\u624b\u5934\u53ea\u6709\u4e00\u5f20RTX 4090&#xff0c;\u5374\u8981\u5904\u7406\u4e00\u4efd300\u9875\u7684\u4e0a\u5e02\u516c\u53f8\u8d22\u62a5\u3001\u4e00\u4efd\u5e26\u9644\u5f55\u7684\u8de8\u5883\u5408\u540c&#xff0c;\u6216\u8005\u4e00\u672c20\u4e07\u5b57\u7684\u6280\u672f\u767d\u76ae\u4e66&#xff1f;\u4f20\u7edf8B\u7ea7\u6a21\u578b\u4e00\u78b0\u4e0a\u8d85\u957f\u6587\u672c\u5c31\u5361\u987f\u3001OOM\u3001\u4e22\u4e0a\u4e0b\u6587\u2014\u2014\u8981\u4e48\u5207\u5206\u540e\u4fe1\u606f\u65ad\u88c2&#xff0c;\u8981\u4e48\u7b49\u534a\u5929\u624d\u5410","og_url":"https:\/\/www.wsisp.com\/helps\/73557.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-02-07T14:05:40+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"6 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/73557.html","url":"https:\/\/www.wsisp.com\/helps\/73557.html","name":"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-02-07T14:05:40+00:00","dateModified":"2026-02-07T14:05:40+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/73557.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/73557.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/73557.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"GLM-4-9B-Chat-1M\u90e8\u7f72\u6559\u7a0b\uff1aNVIDIA Triton\u63a8\u7406\u670d\u52a1\u5668\u96c6\u6210\u65b9\u6848"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/73557","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=73557"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/73557\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=73557"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=73557"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=73557"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=73557"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}