{"id":56923,"date":"2025-08-15T01:14:54","date_gmt":"2025-08-14T17:14:54","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/56923.html"},"modified":"2025-08-15T01:14:54","modified_gmt":"2025-08-14T17:14:54","slug":"qwen-image%e6%b7%b1%e5%ba%a6%e8%a7%a3%e6%9e%90%ef%bc%9a%e7%aa%81%e7%a0%b4%e6%96%87%e6%9c%ac%e6%b8%b2%e6%9f%93%e4%b8%8e%e5%9b%be%e5%83%8f%e7%bc%96%e8%be%91%e7%9a%84%e8%a7%86%e8%a7%89%e7%94%9f%e6%88%90","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/56923.html","title":{"rendered":"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b"},"content":{"rendered":"<h3>Qwen-Image\u6df1\u5ea6\u89e3\u6790&#xff1a;\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b<\/h3>\n<h4>1 \u5f15\u8a00&#xff1a;\u89c6\u89c9\u751f\u6210\u6a21\u578b\u7684\u6311\u6218\u4e0e\u7a81\u7834<\/h4>\n<p>\u5728AI\u751f\u6210\u5185\u5bb9\u9886\u57df&#xff0c;\u56fe\u50cf\u751f\u6210\u6a21\u578b\u9762\u4e34\u4e24\u5927\u6838\u5fc3\u6311\u6218&#xff1a;\u590d\u6742\u6587\u672c\u6e32\u67d3\u548c\u7f16\u8f91\u4e00\u81f4\u6027\u3002\u4f20\u7edf\u6a21\u578b\u5728\u5b57\u6bcd\u6587\u5b57&#xff08;\u5982\u82f1\u8bed&#xff09;\u4e0a\u8868\u73b0\u5c1a\u53ef&#xff0c;\u4f46\u5728\u8868\u610f\u6587\u5b57&#xff08;\u5982\u4e2d\u6587&#xff09;\u573a\u666f\u4e2d\u5e38\u51fa\u73b0\u6587\u5b57\u626d\u66f2\u3001\u7f3a\u5931\u95ee\u9898\u3002\u540c\u65f6&#xff0c;\u56fe\u50cf\u7f16\u8f91\u4efb\u52a1\u4e2d\u4fdd\u6301\u539f\u59cb\u56fe\u50cf\u8bed\u4e49\u8fde\u8d2f\u6027\u548c\u89c6\u89c9\u7ec6\u8282\u4e00\u81f4\u6027\u66f4\u662f\u4e1a\u754c\u96be\u9898\u3002<\/p>\n<p>Qwen-Image\u4f5c\u4e3aQwen\u7cfb\u5217\u9996\u4e2a\u56fe\u50cf\u751f\u6210\u57fa\u7840\u6a21\u578b&#xff0c;\u901a\u8fc7\u4e09\u5927\u521b\u65b0\u7a81\u7834\u8fd9\u4e9b\u74f6\u9888&#xff1a;<\/p>\n<li>\u6e10\u8fdb\u5f0f\u6587\u672c\u6e32\u67d3\u8bad\u7ec3&#xff1a;\u4ece\u57fa\u7840
\u5b57\u7b26\u5230\u6bb5\u843d\u7ea7\u63cf\u8ff0\u7684\u8bfe\u7a0b\u5b66\u4e60<\/li>\n<li>\u53cc\u7f16\u7801\u673a\u5236&#xff1a;Qwen2.5-VL\u8bed\u4e49\u7f16\u7801 &#043; VAE\u89c6\u89c9\u7f16\u7801\u7684\u534f\u540c<\/li>\n<li>\u591a\u4efb\u52a1\u7edf\u4e00\u67b6\u6784&#xff1a;\u6587\u672c\u5230\u56fe\u50cf(T2I)\u3001\u56fe\u50cf\u5230\u56fe\u50cf(I2I)\u3001\u6587\u672c-\u56fe\u50cf\u5230\u56fe\u50cf(TI2I)\u7684\u8054\u5408\u8bad\u7ec3<\/li>\n<p><img decoding=\"async\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/08\/20250814171450-689e198ae2a8a.png\" alt=\"\u5728\u8fd9\u91cc\u63d2\u5165\u56fe\u7247\u63cf\u8ff0\" \/><\/p>\n<p>\u56fe1&#xff1a;Qwen-Image\u53cc\u6d41\u67b6\u6784&#xff1a;\u6587\u672c\u6d41&#xff08;\u5de6&#xff09;\u4e0e\u56fe\u50cf\u6d41&#xff08;\u53f3&#xff09;\u7684\u534f\u540c\u5de5\u4f5c<\/p>\n<h4>2 \u6a21\u578b\u67b6\u6784\u8bbe\u8ba1<\/h4>\n<h5>2.1 \u6574\u4f53\u6846\u67b6<\/h5>\n<p>Qwen-Image\u57fa\u4e8e\u4e09\u6a21\u5757\u534f\u540c\u67b6\u6784&#xff1a;<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">QwenImage<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>mllm <span class=\"token operator\">&#061;<\/span> Qwen2_5VL<span class=\"token punctuation\">(<\/span><span class=\"token 
punctuation\">)<\/span>  <span class=\"token comment\"># \u591a\u6a21\u6001\u8bed\u8a00\u6a21\u578b<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>vae <span class=\"token operator\">&#061;<\/span> HybridVAE<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>   <span class=\"token comment\"># \u6df7\u5408\u53d8\u5206\u81ea\u7f16\u7801\u5668<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>mmdit <span class=\"token operator\">&#061;<\/span> MMDiT<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>     <span class=\"token comment\"># \u591a\u6a21\u6001\u6269\u6563\u53d8\u6362\u5668<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">,<\/span> image<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token comment\"># \u8bed\u4e49\u7279\u5f81\u63d0\u53d6<\/span><br \/>\n        h_text <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>mllm<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">)<\/span>  <\/p>\n<p>        <span class=\"token comment\"># \u89c6\u89c9\u7279\u5f81\u63d0\u53d6<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> image <span class=\"token keyword\">is<\/span> <span class=\"token keyword\">not<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            z_vae <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>vae<span class=\"token punctuation\">.<\/span>encode<span class=\"token punctuation\">(<\/span>image<span class=\"token 
punctuation\">)<\/span><br \/>\n            h_image <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>mllm<span class=\"token punctuation\">.<\/span>visual_encoder<span class=\"token punctuation\">(<\/span>image<span class=\"token punctuation\">)<\/span><br \/>\n            h <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>cat<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">[<\/span>h_text<span class=\"token punctuation\">,<\/span> h_image<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">else<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            h <span class=\"token operator\">&#061;<\/span> h_text<\/p>\n<p>        <span class=\"token comment\"># \u6269\u6563\u751f\u6210<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> self<span class=\"token punctuation\">.<\/span>mmdit<span class=\"token punctuation\">(<\/span>h<span class=\"token punctuation\">,<\/span> z_vae<span class=\"token punctuation\">)<\/span><\/p>\n<h5>2.2 \u591a\u6a21\u6001\u8bed\u8a00\u6a21\u578b&#xff08;Qwen2.5-VL&#xff09;<\/h5>\n<p>\u4f5c\u4e3a\u6761\u4ef6\u7f16\u7801\u5668&#xff0c;\u5176\u6838\u5fc3\u4f18\u52bf\u5728\u4e8e&#xff1a;<\/p>\n<li>\u89c6\u89c9-\u8bed\u8a00\u7a7a\u95f4\u5bf9\u9f50<\/li>\n<li>\u4fdd\u7559\u5f3a\u8bed\u8a00\u5efa\u6a21\u80fd\u529b<\/li>\n<li>\u652f\u6301\u591a\u6a21\u6001\u8f93\u5165<\/li>\n<p>\u7cfb\u7edf\u63d0\u793a\u6a21\u677f\u8bbe\u8ba1&#xff1a;<\/p>\n<p><span class=\"token comment\"># \u6587\u672c\u5230\u56fe\u50cf\u4efb\u52a1\u63d0\u793a<\/span><br \/>\nt2i_prompt <span class=\"token operator\">&#061;<\/span> <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;<br 
\/>\n&lt;|im_start|&gt;system<br \/>\n\u63cf\u8ff0\u56fe\u50cf&#xff0c;\u8be6\u7ec6\u8bf4\u660e\u989c\u8272\u3001\u6570\u91cf\u3001\u6587\u5b57\u3001\u5f62\u72b6\u3001\u5927\u5c0f\u3001\u7eb9\u7406\u3001\u7269\u4f53\u548c\u80cc\u666f\u7684\u7a7a\u95f4\u5173\u7cfb&#xff1a;&lt;|im_end|&gt;<br \/>\n&lt;|im_start|&gt;user<br \/>\n{user_text}&lt;|im_end|&gt;<br \/>\n&lt;|im_start|&gt;assistant<br \/>\n&#034;&#034;&#034;<\/span><\/p>\n<p><span class=\"token comment\"># \u56fe\u50cf\u7f16\u8f91\u4efb\u52a1\u63d0\u793a<\/span><br \/>\nti2i_prompt <span class=\"token operator\">&#061;<\/span> <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;<br \/>\n&lt;|im_start|&gt;system<br \/>\n\u63cf\u8ff0\u8f93\u5165\u56fe\u50cf\u7684\u5173\u952e\u7279\u5f81&#xff0c;\u7136\u540e\u8bf4\u660e\u7528\u6237\u7684\u6587\u5b57\u6307\u4ee4\u5982\u4f55\u4fee\u6539\u56fe\u50cf&lt;|im_end|&gt;<br \/>\n&lt;|im_start|&gt;user<br \/>\n{user_image}{user_text}&lt;|im_end|&gt;<br \/>\n&lt;|im_start|&gt;assistant<br \/>\n&#034;&#034;&#034;<\/span><\/p>\n<h5>2.3 \u53d8\u5206\u81ea\u7f16\u7801\u5668\u521b\u65b0<\/h5>\n<p>\u91c7\u7528\u5355\u7f16\u7801\u5668\u53cc\u89e3\u7801\u5668\u67b6\u6784&#xff0c;\u89e3\u51b3\u4f20\u7edfVAE\u7684\u6587\u672c\u91cd\u5efa\u7f3a\u9677&#xff1a;<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">HybridVAE<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span 
class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>encoder <span class=\"token operator\">&#061;<\/span> Conv3DEncoder<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u5171\u4eab\u7f16\u7801\u5668<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>image_decoder <span class=\"token operator\">&#061;<\/span> ImageDecoder<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>video_decoder <span class=\"token operator\">&#061;<\/span> VideoDecoder<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> x<span class=\"token punctuation\">,<\/span> mode<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#039;image&#039;<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        z <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>encoder<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">if<\/span> mode <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#039;image&#039;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token keyword\">return<\/span> self<span class=\"token punctuation\">.<\/span>image_decoder<span class=\"token punctuation\">(<\/span>z<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">else<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token 
comment\"># video<\/span><br \/>\n            <span class=\"token keyword\">return<\/span> self<span class=\"token punctuation\">.<\/span>video_decoder<span class=\"token punctuation\">(<\/span>z<span class=\"token punctuation\">)<\/span><\/p>\n<p>\u8bad\u7ec3\u5173\u952e\u53d1\u73b0&#xff1a;<\/p>\n<ul>\n<li>\u5e73\u8861\u91cd\u5efa\u635f\u5931(L1)\u4e0e\u611f\u77e5\u635f\u5931(VGG)\u51cf\u5c11\u4f2a\u5f71<\/li>\n<li>\u5bf9\u6297\u635f\u5931\u5728\u9ad8\u8d28\u91cf\u91cd\u5efa\u65f6\u6548\u679c\u51cf\u5f31<\/li>\n<li>\u4ec5\u5fae\u8c03\u89e3\u7801\u5668\u5373\u53ef\u63d0\u5347\u5c0f\u6587\u672c\u6e32\u67d3<\/li>\n<\/ul>\n<h5>2.4 \u591a\u6a21\u6001\u6269\u6563\u53d8\u6362\u5668<\/h5>\n<p>\u91c7\u7528MMDiT\u67b6\u6784\u5e76\u521b\u65b0MSRoPE\u4f4d\u7f6e\u7f16\u7801&#xff1a;<\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">MMDiT<\/span><span class=\"token punctuation\">(<\/span>nn<span class=\"token punctuation\">.<\/span>Module<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1024<\/span><span class=\"token punctuation\">,<\/span> heads<span class=\"token operator\">&#061;<\/span><span class=\"token number\">16<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token builtin\">super<\/span><span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>__init__<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>layers <span class=\"token operator\">&#061;<\/span> nn<span class=\"token 
punctuation\">.<\/span>ModuleList<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">[<\/span><br \/>\n            DiTBlock<span class=\"token punctuation\">(<\/span>dim<span class=\"token punctuation\">,<\/span> heads<span class=\"token punctuation\">,<\/span> use_msrope<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">True<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            <span class=\"token keyword\">for<\/span> _ <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">24<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">forward<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> h<span class=\"token punctuation\">,<\/span> z<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        x <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>ms_rope<span class=\"token punctuation\">(<\/span>h<span class=\"token punctuation\">,<\/span> z<span class=\"token punctuation\">)<\/span>  <span class=\"token comment\"># \u591a\u6a21\u6001\u4f4d\u7f6e\u7f16\u7801<\/span><br \/>\n        <span class=\"token keyword\">for<\/span> layer <span class=\"token keyword\">in<\/span> self<span class=\"token punctuation\">.<\/span>layers<span class=\"token punctuation\">:<\/span><br \/>\n            x <span class=\"token operator\">&#061;<\/span> layer<span class=\"token punctuation\">(<\/span>x<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> x<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">ms_rope<\/span><span 
class=\"token punctuation\">(<\/span>text_emb<span class=\"token punctuation\">,<\/span> image_emb<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token triple-quoted-string string\">&#034;&#034;&#034;<br \/>\n    \u591a\u6a21\u6001\u53ef\u6269\u5c55\u65cb\u8f6c\u4f4d\u7f6e\u7f16\u7801<br \/>\n    text_emb: [B, L_t, D]<br \/>\n    image_emb: [B, H, W, D]<br \/>\n    &#034;&#034;&#034;<\/span><br \/>\n    <span class=\"token comment\"># \u6587\u672c\u89c6\u4e3a\u5bf9\u89d2\u7ebf\u5e03\u5c40<\/span><br \/>\n    text_pos <span class=\"token operator\">&#061;<\/span> diag_position<span class=\"token punctuation\">(<\/span>text_emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    text_rot <span class=\"token operator\">&#061;<\/span> apply_rope<span class=\"token punctuation\">(<\/span>text_emb<span class=\"token punctuation\">,<\/span> text_pos<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u56fe\u50cf\u4e2d\u5fc3\u8d77\u59cb\u7f16\u7801<\/span><br \/>\n    img_pos <span class=\"token operator\">&#061;<\/span> center_position<span class=\"token punctuation\">(<\/span>image_emb<span class=\"token punctuation\">.<\/span>shape<span class=\"token punctuation\">[<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">:<\/span><span class=\"token number\">3<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    img_rot <span class=\"token operator\">&#061;<\/span> apply_rope<span class=\"token punctuation\">(<\/span>image_emb<span class=\"token punctuation\">,<\/span> img_pos<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">return<\/span> torch<span class=\"token 
punctuation\">.<\/span>cat<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">[<\/span>text_rot<span class=\"token punctuation\">,<\/span> img_rot<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span> dim<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>\u56fe2&#xff1a;MSRoPE\u4f4d\u7f6e\u7f16\u7801\u7b56\u7565&#xff08;\u6587\u672c\u6cbf\u5bf9\u89d2\u7ebf&#xff0c;\u56fe\u50cf\u4ece\u4e2d\u5fc3\u8f90\u5c04&#xff09;<\/p>\n<h4>3 \u6570\u636e\u5904\u7406\u6d41\u7a0b<\/h4>\n<h5>3.1 \u6570\u636e\u6536\u96c6\u7b56\u7565<\/h5>\n<p>\u6784\u5efa\u767e\u4ebf\u7ea7\u56fe\u6587\u5bf9\u6570\u636e\u96c6&#xff0c;\u56db\u7c7b\u6570\u636e\u5206\u5e03&#xff1a;<\/p>\n<p>  #mermaid-svg-NgVuZR1w6VrznDR7 {font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;fill:#333;}#mermaid-svg-NgVuZR1w6VrznDR7 .error-icon{fill:#552222;}#mermaid-svg-NgVuZR1w6VrznDR7 .error-text{fill:#552222;stroke:#552222;}#mermaid-svg-NgVuZR1w6VrznDR7 .edge-thickness-normal{stroke-width:2px;}#mermaid-svg-NgVuZR1w6VrznDR7 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-svg-NgVuZR1w6VrznDR7 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-svg-NgVuZR1w6VrznDR7 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-svg-NgVuZR1w6VrznDR7 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-svg-NgVuZR1w6VrznDR7 .marker{fill:#333333;stroke:#333333;}#mermaid-svg-NgVuZR1w6VrznDR7 .marker.cross{stroke:#333333;}#mermaid-svg-NgVuZR1w6VrznDR7 svg{font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:16px;}#mermaid-svg-NgVuZR1w6VrznDR7 .pieCircle{stroke:black;stroke-width:2px;opacity:0.7;}#mermaid-svg-NgVuZR1w6VrznDR7 .pieTitleText{text-anchor:middle;font-size:25px;fill:black;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}#mermaid-svg-NgVuZR1w6VrznDR7 .slice{font-family:\\&#8221;trebuchet 
ms\\&#8221;,verdana,arial,sans-serif;fill:#333;font-size:17px;}#mermaid-svg-NgVuZR1w6VrznDR7 .legend text{fill:black;font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;font-size:17px;}#mermaid-svg-NgVuZR1w6VrznDR7 :root{&#8211;mermaid-font-family:\\&#8221;trebuchet ms\\&#8221;,verdana,arial,sans-serif;}<\/p>\n<h5>3.2 \u4e03\u9636\u6bb5\u6570\u636e\u8fc7\u6ee4<\/h5>\n<p>\u7cbe\u5bc6\u7684\u591a\u7ea7\u8fc7\u6ee4\u6d41\u7a0b&#xff1a;<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">multi_stage_filter<\/span><span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">,<\/span> texts<span class=\"token punctuation\">,<\/span> stage<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">if<\/span> stage <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u521d\u59cb\u6e05\u7406<\/span><br \/>\n        images <span class=\"token operator\">&#061;<\/span> filter_corrupted<span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">)<\/span><br \/>\n        images <span class=\"token operator\">&#061;<\/span> filter_low_res<span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">,<\/span> min_res<span class=\"token operator\">&#061;<\/span><span class=\"token number\">256<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        texts <span class=\"token operator\">&#061;<\/span> filter_invalid_text<span class=\"token punctuation\">(<\/span>texts<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> stage <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">2<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u8d28\u91cf\u4f18\u5316<\/span><br \/>\n        
images <span class=\"token operator\">&#061;<\/span> filter_blur<span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">,<\/span> threshold<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.7<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        images <span class=\"token operator\">&#061;<\/span> filter_low_entropy<span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">,<\/span> threshold<span class=\"token operator\">&#061;<\/span><span class=\"token number\">2.0<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> stage <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">3<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u56fe\u6587\u5bf9\u9f50<\/span><br \/>\n        pairs <span class=\"token operator\">&#061;<\/span> clip_alignment<span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">,<\/span> texts<span class=\"token punctuation\">,<\/span> threshold<span class=\"token operator\">&#061;<\/span><span class=\"token number\">0.85<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> stage <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token number\">4<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u6587\u672c\u6e32\u67d3\u589e\u5f3a<\/span><br \/>\n        images<span class=\"token punctuation\">,<\/span> texts <span class=\"token operator\">&#061;<\/span> augment_text_rendering<span class=\"token punctuation\">(<\/span>images<span class=\"token punctuation\">,<\/span> texts<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u9636\u6bb55-7\u5904\u7406\u9ad8\u5206\u8fa8\u7387\u5e73\u8861<\/span><br \/>\n    <span class=\"token keyword\">return<\/span> refined_data<\/p>\n<h5>3.3 
\u7ed3\u6784\u5316\u6570\u636e\u6807\u6ce8<\/h5>\n<p>\u521b\u65b0\u6807\u6ce8\u6846\u67b6\u540c\u6b65\u751f\u6210\u63cf\u8ff0\u4e0e\u5143\u6570\u636e&#xff1a;<\/p>\n<p><span class=\"token punctuation\">{<\/span><br \/>\n  <span class=\"token string\">&#034;caption&#034;<\/span><span class=\"token operator\">:<\/span> <span class=\"token string\">&#034;\u5496\u5561\u9986\u5185\u666f&#xff0c;\u6728\u684c\u4e0a\u653e\u7740\u767d\u8272\u9a6c\u514b\u676f&#xff0c;\u676f\u8eab\u5370\u6709&#039;Hello World&#039;\u5b57\u6837&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n  <span class=\"token string\">&#034;image_type&#034;<\/span><span class=\"token operator\">:<\/span> <span class=\"token string\">&#034;\u5ba4\u5185\u6444\u5f71&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n  <span class=\"token string\">&#034;image_style&#034;<\/span><span class=\"token operator\">:<\/span> <span class=\"token string\">&#034;\u5199\u5b9e\u98ce\u683c&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n  <span class=\"token string\">&#034;watermark_list&#034;<\/span><span class=\"token operator\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n  <span class=\"token string\">&#034;abnormal_elements&#034;<\/span><span class=\"token operator\">:<\/span> <span class=\"token boolean\">false<\/span><br \/>\n<span class=\"token punctuation\">}<\/span><\/p>\n<h5>3.4 \u6587\u672c\u6e32\u67d3\u589e\u5f3a<\/h5>\n<p>\u4e09\u9636\u6bb5\u5408\u6210\u7b56\u7565\u89e3\u51b3\u4e2d\u6587\u957f\u5c3e\u5206\u5e03\u95ee\u9898&#xff1a;<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">render_text_synthesis<\/span><span class=\"token punctuation\">(<\/span>mode<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">,<\/span> background<span class=\"token punctuation\">)<\/span><span class=\"token 
punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">if<\/span> mode <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;pure&#034;<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u7eaf\u6587\u672c\u6e32\u67d3<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> render_on_plain_bg<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">,<\/span> font_size<span class=\"token operator\">&#061;<\/span><span class=\"token number\">24<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> mode <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;composite&#034;<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u60c5\u5883\u5408\u6210<\/span><br \/>\n        canvas <span class=\"token operator\">&#061;<\/span> place_on_realistic_bg<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">,<\/span> background<span class=\"token punctuation\">)<\/span><br \/>\n        caption <span class=\"token operator\">&#061;<\/span> generate_context_caption<span class=\"token punctuation\">(<\/span>canvas<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> canvas<span class=\"token punctuation\">,<\/span> caption<\/p>\n<p>    <span class=\"token keyword\">elif<\/span> mode <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#034;complex&#034;<\/span><span class=\"token punctuation\">:<\/span>  <span class=\"token comment\"># \u590d\u6742\u5e03\u5c40<\/span><br \/>\n        template <span class=\"token operator\">&#061;<\/span> select_template<span class=\"token punctuation\">(<\/span><span class=\"token string\">&#034;ppt&#034;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> 
fill_template<span class=\"token punctuation\">(<\/span>template<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">)<\/span><\/p>\n<h4>4 \u8bad\u7ec3\u7b56\u7565\u8be6\u89e3<\/h4>\n<h5>4.1 \u9884\u8bad\u7ec3\u9636\u6bb5<\/h5>\n<p>\u91c7\u7528\u6d41\u5339\u914d\u76ee\u6807\u51fd\u6570&#xff1a; <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\"><\/p>\n<p>          v<\/p>\n<p>          t<\/p>\n<p>         &#061;<\/p>\n<p>           d<\/p>\n<p>            x<\/p>\n<p>            t<\/p>\n<p>           d<\/p>\n<p>           t<\/p>\n<p>         &#061;<\/p>\n<p>          x<\/p>\n<p>          0<\/p>\n<p>         \u2212<\/p>\n<p>          x<\/p>\n<p>          1<\/p>\n<p>        v_t &#061; \\\\frac{dx_t}{dt} &#061; x_0 &#8211; x_1<\/p>\n<p>     <\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.5806em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 2.0574em;vertical-align: -0.686em\"><\/span><span class=\"mord\"><span class=\"mopen nulldelimiter\"><\/span><span 
class=\"mfrac\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 1.3714em\"><span class=\"\" style=\"top: -2.314em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"mord mathnormal\">t<\/span><\/span><\/span><span class=\"\" style=\"top: -3.23em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"frac-line\" style=\"border-bottom-width: 0.04em\"><\/span><\/span><span class=\"\" style=\"top: -3.677em\"><span class=\"pstrut\" style=\"height: 3em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">d<\/span><span class=\"mord\"><span class=\"mord mathnormal\">x<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.686em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><span class=\"mclose nulldelimiter\"><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.7333em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">x<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 
0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">0<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u2212<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 0.5806em;vertical-align: -0.15em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\">x<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3011em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">1<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span> <span class=\"katex--display\"><span class=\"katex-display\"><span class=\"katex\"><span class=\"katex-mathml\">\\\\mathcal{L} &#061; 
\\\\mathbb{E}||v_\\\\theta(x_t,t,h) - v_t||^2<\/span><span class=\"katex-html\"><span class=\"base\"><span class=\"strut\" style=\"height: 0.6833em\"><\/span><span class=\"mord mathcal\">L<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><span class=\"mrel\">&#061;<\/span><span class=\"mspace\" style=\"margin-right: 0.2778em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1em;vertical-align: -0.25em\"><\/span><span class=\"mord mathbb\">E<\/span><span class=\"mord\">\u2223\u2223<\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.3361em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\" style=\"margin-right: 0.0278em\">\u03b8<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mopen\">(<\/span><span class=\"mord\"><span class=\"mord mathnormal\">x<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: 0em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" 
style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">t<\/span><span class=\"mpunct\">,<\/span><span class=\"mspace\" style=\"margin-right: 0.1667em\"><\/span><span class=\"mord mathnormal\">h<\/span><span class=\"mclose\">)<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><span class=\"mbin\">\u2212<\/span><span class=\"mspace\" style=\"margin-right: 0.2222em\"><\/span><\/span><span class=\"base\"><span class=\"strut\" style=\"height: 1.1141em;vertical-align: -0.25em\"><\/span><span class=\"mord\"><span class=\"mord mathnormal\" style=\"margin-right: 0.0359em\">v<\/span><span class=\"msupsub\"><span class=\"vlist-t vlist-t2\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.2806em\"><span class=\"\" style=\"top: -2.55em;margin-left: -0.0359em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mathnormal mtight\">t<\/span><\/span><\/span><\/span><span class=\"vlist-s\">\u200b<\/span><\/span><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.15em\"><span class=\"\"><\/span><\/span><\/span><\/span><\/span><\/span><span class=\"mord\">\u2223<\/span><span class=\"mord\"><span class=\"mord\">\u2223<\/span><span class=\"msupsub\"><span class=\"vlist-t\"><span class=\"vlist-r\"><span class=\"vlist\" style=\"height: 0.8641em\"><span class=\"\" style=\"top: -3.113em;margin-right: 0.05em\"><span class=\"pstrut\" style=\"height: 2.7em\"><\/span><span class=\"sizing reset-size6 size3 mtight\"><span class=\"mord mtight\">2<\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/span><\/p>\n<p>4.1.1 \u751f\u4ea7\u8005-\u6d88\u8d39\u8005\u6846\u67b6<\/p>\n<p><span class=\"token comment\"># \u751f\u4ea7\u8005\u8282\u70b9<\/span><br \/>\n<span class=\"token keyword\">def<\/span> <span class=\"token function\">producer_process<\/span><span class=\"token 
punctuation\">(<\/span>data_queue<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">while<\/span> <span class=\"token boolean\">True<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        raw_data <span class=\"token operator\">&#061;<\/span> load_raw_data<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        filtered <span class=\"token operator\">&#061;<\/span> stage_filter<span class=\"token punctuation\">(<\/span>raw_data<span class=\"token punctuation\">,<\/span> current_stage<span class=\"token punctuation\">)<\/span><br \/>\n        processed <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n            <span class=\"token string\">&#039;h&#039;<\/span><span class=\"token punctuation\">:<\/span> mllm<span class=\"token punctuation\">.<\/span>encode<span class=\"token punctuation\">(<\/span>filtered<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;text&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            <span class=\"token string\">&#039;z&#039;<\/span><span class=\"token punctuation\">:<\/span> vae<span class=\"token punctuation\">.<\/span>encode<span class=\"token punctuation\">(<\/span>filtered<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;image&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token punctuation\">}<\/span><br \/>\n        data_queue<span class=\"token punctuation\">.<\/span>put<span class=\"token punctuation\">(<\/span>processed<span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># \u6d88\u8d39\u8005\u8282\u70b9<\/span><br \/>\n<span class=\"token keyword\">def<\/span> <span 
class=\"token function\">consumer_process<\/span><span class=\"token punctuation\">(<\/span>data_queue<span class=\"token punctuation\">,<\/span> model<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">while<\/span> <span class=\"token boolean\">True<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        batch <span class=\"token operator\">&#061;<\/span> data_queue<span class=\"token punctuation\">.<\/span>get_batch<span class=\"token punctuation\">(<\/span><span class=\"token number\">32<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        loss <span class=\"token operator\">&#061;<\/span> model<span class=\"token punctuation\">(<\/span>batch<span class=\"token punctuation\">)<\/span><br \/>\n        loss<span class=\"token punctuation\">.<\/span>backward<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        optimizer<span class=\"token punctuation\">.<\/span>step<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>4.1.2 \u5206\u5e03\u5f0f\u8bad\u7ec3\u4f18\u5316 \u5173\u952e\u6280\u672f\u521b\u65b0&#xff1a;<\/p>\n<ul>\n<li>4\u8def\u5f20\u91cf\u5e76\u884c<\/li>\n<li>bfloat16\u68af\u5ea6\u805a\u5408&#043;float32\u5f52\u7ea6<\/li>\n<li>\u7981\u7528\u6fc0\u6d3b\u68c0\u67e5\u70b9&#xff08;\u542f\u7528\u4ec5\u8282\u770111.3%\u663e\u5b58&#xff0c;\u4f46\u663e\u8457\u589e\u52a0\u8bad\u7ec3\u8017\u65f6&#xff09;<\/li>\n<\/ul>\n<p>4.1.3 \u6e10\u8fdb\u5f0f\u8bad\u7ec3\u7b56\u7565 \u4e94\u7ef4\u6e10\u8fdb\u5b66\u4e60&#xff1a;<\/p>\n<p>training_strategy <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n    <span class=\"token string\">&#039;resolution&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;256&#215;256&#039;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;640&#215;640&#039;<\/span><span 
class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;1328&#215;1328&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    <span class=\"token string\">&#039;text_rendering&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;none&#039;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;single&#039;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;paragraph&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    <span class=\"token string\">&#039;data_quality&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;mass&#039;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;curated&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    <span class=\"token string\">&#039;data_balance&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;imbalanced&#039;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;balanced&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n    <span class=\"token string\">&#039;data_source&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;real&#039;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;synthetic&#039;<\/span><span class=\"token punctuation\">]<\/span><br \/>\n<span class=\"token punctuation\">}<\/span><\/p>\n<h5>4.2 \u540e\u8bad\u7ec3\u4f18\u5316<\/h5>\n<p>4.2.1 \u76d1\u7763\u5fae\u8c03(SFT) 
\u6784\u5efa\u5206\u5c42\u8bed\u4e49\u6570\u636e\u96c6&#xff1a;<\/p>\n<ul>\n<li>\u7cbe\u9009\u9ad8\u6e05\u6670\u5ea6\u3001\u7ec6\u8282\u4e30\u5bcc\u7684\u56fe\u50cf<\/li>\n<li>\u4eba\u5de5\u6807\u6ce8\u7a81\u51fa\u5173\u952e\u89c6\u89c9\u7279\u5f81<\/li>\n<li>\u5e73\u8861\u827a\u672f\u98ce\u683c\u4e0e\u771f\u5b9e\u611f\u6837\u672c<\/li>\n<\/ul>\n<p>4.2.2 \u5f3a\u5316\u5b66\u4e60\u4f18\u5316 \u878d\u5408DPO\u4e0eGRPO&#xff1a;<\/p>\n<p><span class=\"token comment\"># DPO\u635f\u5931\u51fd\u6570<\/span><br \/>\n<span class=\"token keyword\">def<\/span> <span class=\"token function\">dpo_loss<\/span><span class=\"token punctuation\">(<\/span>policy_model<span class=\"token punctuation\">,<\/span> ref_model<span class=\"token punctuation\">,<\/span> win_data<span class=\"token punctuation\">,<\/span> lose_data<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    t <span class=\"token operator\">&#061;<\/span> uniform_sample<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1<\/span><span class=\"token punctuation\">)<\/span><br \/>\n    x_t_win <span class=\"token operator\">&#061;<\/span> t <span class=\"token operator\">*<\/span> win_data<span class=\"token punctuation\">.<\/span>x0 <span class=\"token operator\">&#043;<\/span> <span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token operator\">&#8211;<\/span>t<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">*<\/span> noise<br \/>\n    v_t_win <span class=\"token operator\">&#061;<\/span> win_data<span class=\"token punctuation\">.<\/span>x0 <span class=\"token operator\">&#8211;<\/span> noise<\/p>\n<p>    diff_policy <span class=\"token operator\">&#061;<\/span> policy_model<span class=\"token punctuation\">(<\/span>x_t_win<span class=\"token punctuation\">,<\/span> t<span class=\"token 
punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>mse<span class=\"token punctuation\">(<\/span>v_t_win<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&#8211;<\/span> policy_model<span class=\"token punctuation\">(<\/span>x_t_lose<span class=\"token punctuation\">,<\/span> t<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>mse<span class=\"token punctuation\">(<\/span>v_t_lose<span class=\"token punctuation\">)<\/span><br \/>\n    diff_ref <span class=\"token operator\">&#061;<\/span> ref_model<span class=\"token punctuation\">(<\/span>x_t_win<span class=\"token punctuation\">,<\/span> t<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>mse<span class=\"token punctuation\">(<\/span>v_t_win<span class=\"token punctuation\">)<\/span> <span class=\"token operator\">&#8211;<\/span> ref_model<span class=\"token punctuation\">(<\/span>x_t_lose<span class=\"token punctuation\">,<\/span> t<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">.<\/span>mse<span class=\"token punctuation\">(<\/span>v_t_lose<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">return<\/span> <span class=\"token operator\">&#8211;<\/span>torch<span class=\"token punctuation\">.<\/span>log<span class=\"token punctuation\">(<\/span>torch<span class=\"token punctuation\">.<\/span>sigmoid<span class=\"token punctuation\">(<\/span><span class=\"token operator\">&#8211;<\/span>beta<span class=\"token operator\">*<\/span><span class=\"token punctuation\">(<\/span>diff_policy <span class=\"token operator\">&#8211;<\/span> diff_ref<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token comment\"># GRPO\u8bad\u7ec3\u6d41\u7a0b<\/span><br \/>\n<span class=\"token keyword\">def<\/span> <span class=\"token 
function\">grpo_train<\/span><span class=\"token punctuation\">(<\/span>policy<span class=\"token punctuation\">,<\/span> ref_model<span class=\"token punctuation\">,<\/span> prompt<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    trajectories <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span>policy<span class=\"token punctuation\">.<\/span>generate<span class=\"token punctuation\">(<\/span>prompt<span class=\"token punctuation\">)<\/span> <span class=\"token keyword\">for<\/span> _ <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token number\">8<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span><br \/>\n    rewards <span class=\"token operator\">&#061;<\/span> reward_model<span class=\"token punctuation\">(<\/span>trajectories<span class=\"token punctuation\">)<\/span><br \/>\n    advantages <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">(<\/span>rewards <span class=\"token operator\">&#8211;<\/span> rewards<span class=\"token punctuation\">.<\/span>mean<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token operator\">\/<\/span> rewards<span class=\"token punctuation\">.<\/span>std<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">for<\/span> traj <span class=\"token keyword\">in<\/span> trajectories<span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">for<\/span> t <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span><span class=\"token builtin\">len<\/span><span class=\"token punctuation\">(<\/span>traj<span class=\"token 
punctuation\">)<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            ratio <span class=\"token operator\">&#061;<\/span> policy_prob<span class=\"token punctuation\">(<\/span>traj<span class=\"token punctuation\">[<\/span>t<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span> <span class=\"token operator\">\/<\/span> ref_model_prob<span class=\"token punctuation\">(<\/span>traj<span class=\"token punctuation\">[<\/span>t<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            kl_div <span class=\"token operator\">&#061;<\/span> compute_kl<span class=\"token punctuation\">(<\/span>policy<span class=\"token punctuation\">,<\/span> ref_model<span class=\"token punctuation\">,<\/span> traj<span class=\"token punctuation\">[<\/span>t<span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            loss <span class=\"token operator\">&#061;<\/span> <span class=\"token builtin\">min<\/span><span class=\"token punctuation\">(<\/span>ratio<span class=\"token operator\">*<\/span>advantages<span class=\"token punctuation\">,<\/span> clip<span class=\"token punctuation\">(<\/span>ratio<span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.8<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1.2<\/span><span class=\"token punctuation\">)<\/span><span class=\"token operator\">*<\/span>advantages <span class=\"token operator\">&#8211;<\/span> beta<span class=\"token operator\">*<\/span>kl_div<br \/>\n            loss<span class=\"token punctuation\">.<\/span>backward<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<h5>4.3 \u591a\u4efb\u52a1\u8bad\u7ec3\u6269\u5c55<\/h5>\n<p>\u7edf\u4e00\u5904\u7406\u6846\u67b6\u652f\u6301&#xff1a;<\/p>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token 
function\">multitask_forward<\/span><span class=\"token punctuation\">(<\/span>task_type<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">,<\/span> image<span class=\"token operator\">&#061;<\/span><span class=\"token boolean\">None<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">if<\/span> task_type <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#039;T2I&#039;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> model<span class=\"token punctuation\">.<\/span>generate_from_text<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> task_type <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#039;TI2I&#039;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        h_image <span class=\"token operator\">&#061;<\/span> model<span class=\"token punctuation\">.<\/span>encode_image<span class=\"token punctuation\">(<\/span>image<span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> model<span class=\"token punctuation\">.<\/span>edit_image<span class=\"token punctuation\">(<\/span>text<span class=\"token punctuation\">,<\/span> h_image<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> task_type <span class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#039;depth_estimation&#039;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> model<span class=\"token punctuation\">.<\/span>predict_depth<span class=\"token punctuation\">(<\/span>image<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">elif<\/span> task_type <span 
class=\"token operator\">&#061;&#061;<\/span> <span class=\"token string\">&#039;novel_view&#039;<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">return<\/span> model<span class=\"token punctuation\">.<\/span>render_new_view<span class=\"token punctuation\">(<\/span>image<span class=\"token punctuation\">,<\/span> text<span class=\"token punctuation\">)<\/span><\/p>\n<h4>5 \u5b9e\u9a8c\u8bc4\u4f30<\/h4>\n<h5>5.1 \u4eba\u7c7b\u8bc4\u4f30&#xff08;AI\u7ade\u6280\u573a&#xff09;<\/h5>\n<p>\u5efa\u7acb\u5f00\u653e\u8bc4\u4f30\u5e73\u53f0&#xff1a;<\/p>\n<p>\u751f\u6210\u63d0\u793a\u8bcd \u2192 \u6a21\u578bA\u533f\u540d\u751f\u6210 \/ \u6a21\u578bB\u533f\u540d\u751f\u6210 \u2192 \u4eba\u7c7b\u8bc4\u4f30 \u2192 
Elo\u8bc4\u5206\u66f4\u65b0<\/p>\n<p>Qwen-Image\u5728\u4e0e\u4e3b\u6d41\u95ed\u6e90\u6a21\u578b\u7684\u5bf9\u6bd4\u4e2d\u6392\u540d\u7b2c\u4e8c&#xff1a;<\/p>\n<table>\n<thead>\n<tr>\n<th>\u6a21\u578b<\/th>\n<th>Elo\u8bc4\u5206<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>Imagen 4 Ultra<\/td>\n<td>1250<\/td>\n<\/tr>\n<tr>\n<td>Qwen-Image<\/td>\n<td>1220<\/td>\n<\/tr>\n<tr>\n<td>GPT Image 1<\/td>\n<td>1190<\/td>\n<\/tr>\n<tr>\n<td>FLUX.1 Kontext<\/td>\n<td>1185<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h5>5.2 \u5b9a\u91cf\u5206\u6790<\/h5>\n<p>5.2.1 VAE\u91cd\u5efa\u6027\u80fd \u5728\u6587\u672c\u91cd\u5efa\u4efb\u52a1\u4e2d\u663e\u8457\u9886\u5148&#xff1a;<\/p>\n<table>\n<thead>\n<tr>\n<th>\u6a21\u578b<\/th>\n<th>Text PSNR<\/th>\n<th>SSIM<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>Wan2.1-VAE<\/td>\n<td>26.77<\/td>\n<td>0.9386<\/td>\n<\/tr>\n<tr>\n<td>FLUX-VAE<\/td>\n<td>32.65<\/td>\n<td>0.9792<\/td>\n<\/tr>\n<tr>\n<td>Qwen-Image-VAE<\/td>\n<td>36.63<\/td>\n<td>0.9839<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>5.2.2 \u6587\u672c\u751f\u6210\u6027\u80fd \u4e2d\u6587\u6e32\u67d3\u78be\u538b\u7ea7\u4f18\u52bf&#xff1a;<\/p>\n<table>\n<thead>\n<tr>\n<th>\u6a21\u578b<\/th>\n<th>\u4e00\u7ea7\u6c49\u5b57<\/th>\n<th>\u4e8c\u7ea7\u6c49\u5b57<\/th>\n<th>\u4e09\u7ea7\u6c49\u5b57<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>Seedream 3.0<\/td>\n<td>53.48<\/td>\n<td>26.23<\/td>\n<td>1.25<\/td>\n<\/tr>\n<tr>\n<td>GPT Image 1<\/td>\n<td>68.37<\/td>\n<td>15.97<\/td>\n<td>3.55<\/td>\n<\/tr>\n<tr>\n<td>Qwen-Image<\/td>\n<td>97.29<\/td>\n<td>40.53<\/td>\n<td>6.48<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h5>5.3 \u5b9a\u6027\u5206\u6790<\/h5>\n<p>5.3.1 \u4e2d\u6587\u6587\u672c\u6e32\u67d3 <img decoding=\"async\" src=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/08\/20250814171451-689e198be42a4.png\" alt=\"Qwen-Image\u751f\u6210\u7684\u4e2d\u6587\u5bf9\u8054\" \/><\/p>\n<p>\u56fe3&#xff1a;Qwen-Image\u7cbe\u51c6\u751f\u6210\u590d\u6742\u4e2d\u6587\u5bf9\u8054<\/p>\n<p>5.3.2 
\u591a\u5bf9\u8c61\u751f\u6210<\/p>\n<p>\u63d0\u793a\u8bcd&#xff1a;\u201c\u5341\u4e8c\u751f\u8096\u6bdb\u7ed2\u73a9\u5177\u6574\u9f50\u6392\u5217&#xff0c;\u4e09\u884c\u56db\u5217\u201d<\/p>\n<p>Qwen-Image\u552f\u4e00\u6b63\u786e\u751f\u6210\u6240\u670912\u4e2a\u751f\u8096\u5f62\u8c61\u5e76\u4fdd\u6301\u6750\u8d28\u4e00\u81f4\u6027\u3002<\/p>\n<h4>6 \u4ee3\u7801\u5b9e\u73b0\u5173\u952e\u6a21\u5757<\/h4>\n<h5>6.1 \u6d41\u5339\u914d\u8bad\u7ec3\u6838\u5fc3<\/h5>\n<p><span class=\"token keyword\">def<\/span> <span class=\"token function\">flow_matching_loss<\/span><span class=\"token punctuation\">(<\/span>model<span class=\"token punctuation\">,<\/span> x0<span class=\"token punctuation\">,<\/span> h<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token comment\"># \u91c7\u6837\u968f\u673a\u566a\u58f0\u548c\u65f6\u95f4\u6b65<\/span><br \/>\n    x1 <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>randn_like<span class=\"token punctuation\">(<\/span>x0<span class=\"token punctuation\">)<\/span><br \/>\n    t <span class=\"token operator\">&#061;<\/span> torch<span class=\"token punctuation\">.<\/span>rand<span class=\"token punctuation\">(<\/span>x0<span class=\"token punctuation\">.<\/span>size<span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span> device<span class=\"token operator\">&#061;<\/span>x0<span class=\"token punctuation\">.<\/span>device<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token comment\"># \u8ba1\u7b97\u4e2d\u95f4\u72b6\u6001<\/span><br \/>\n    xt <span class=\"token operator\">&#061;<\/span> t<span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> 
<span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">*<\/span> x0 <span class=\"token operator\">&#043;<\/span> <span class=\"token punctuation\">(<\/span><span class=\"token number\">1<\/span><span class=\"token operator\">&#8211;<\/span>t<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">[<\/span><span class=\"token punctuation\">:<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token boolean\">None<\/span><span class=\"token punctuation\">]<\/span> <span class=\"token operator\">*<\/span> x1<br \/>\n    vt <span class=\"token operator\">&#061;<\/span> x0 <span class=\"token operator\">&#8211;<\/span> x1<\/p>\n<p>    <span class=\"token comment\"># \u6a21\u578b\u9884\u6d4b<\/span><br \/>\n    v_pred <span class=\"token operator\">&#061;<\/span> model<span class=\"token punctuation\">(<\/span>xt<span class=\"token punctuation\">,<\/span> t<span class=\"token punctuation\">,<\/span> h<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">return<\/span> F<span class=\"token punctuation\">.<\/span>mse_loss<span class=\"token punctuation\">(<\/span>v_pred<span class=\"token punctuation\">,<\/span> vt<span class=\"token punctuation\">)<\/span><\/p>\n<h5>6.2 \u751f\u4ea7\u8005-\u6d88\u8d39\u8005\u6846\u67b6<\/h5>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">DataPipeline<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> num_producers<span class=\"token 
operator\">&#061;<\/span><span class=\"token number\">8<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>queue <span class=\"token operator\">&#061;<\/span> PriorityQueue<span class=\"token punctuation\">(<\/span>maxsize<span class=\"token operator\">&#061;<\/span><span class=\"token number\">1000<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>producers <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">[<\/span>Producer<span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>queue<span class=\"token punctuation\">)<\/span> <span class=\"token keyword\">for<\/span> _ <span class=\"token keyword\">in<\/span> <span class=\"token builtin\">range<\/span><span class=\"token punctuation\">(<\/span>num_producers<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">]<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>consumer <span class=\"token operator\">&#061;<\/span> Consumer<span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">.<\/span>queue<span class=\"token punctuation\">)<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">start<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">for<\/span> p <span class=\"token keyword\">in<\/span> self<span class=\"token punctuation\">.<\/span>producers<span class=\"token punctuation\">:<\/span><br \/>\n            p<span class=\"token punctuation\">.<\/span>start<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>consumer<span class=\"token 
punctuation\">.<\/span>start<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">Producer<\/span><span class=\"token punctuation\">(<\/span>Thread<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">run<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        <span class=\"token keyword\">while<\/span> <span class=\"token boolean\">True<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            data <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>load_next_batch<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            processed <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>process<span class=\"token punctuation\">(<\/span>data<span class=\"token punctuation\">)<\/span><br \/>\n            self<span class=\"token punctuation\">.<\/span>queue<span class=\"token punctuation\">.<\/span>put<span class=\"token punctuation\">(<\/span>processed<span class=\"token punctuation\">,<\/span> priority<span class=\"token operator\">&#061;<\/span>data<span class=\"token punctuation\">[<\/span><span class=\"token string\">&#039;quality_score&#039;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">)<\/span><\/p>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">Consumer<\/span><span class=\"token punctuation\">(<\/span>Thread<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">run<\/span><span class=\"token 
punctuation\">(<\/span>self<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        model <span class=\"token operator\">&#061;<\/span> build_model<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><br \/>\n        <span class=\"token keyword\">while<\/span> <span class=\"token boolean\">True<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            batch <span class=\"token operator\">&#061;<\/span> self<span class=\"token punctuation\">.<\/span>queue<span class=\"token punctuation\">.<\/span>get_batch<span class=\"token punctuation\">(<\/span><span class=\"token number\">32<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            loss <span class=\"token operator\">&#061;<\/span> model<span class=\"token punctuation\">.<\/span>train_step<span class=\"token punctuation\">(<\/span>batch<span class=\"token punctuation\">)<\/span><br \/>\n            update_model<span class=\"token punctuation\">(<\/span>loss<span class=\"token punctuation\">)<\/span><\/p>\n<h5>6.3 \u6e10\u8fdb\u5f0f\u8bad\u7ec3\u8c03\u5ea6\u5668<\/h5>\n<p><span class=\"token keyword\">class<\/span> <span class=\"token class-name\">ProgressiveScheduler<\/span><span class=\"token punctuation\">:<\/span><br \/>\n    <span class=\"token keyword\">def<\/span> <span class=\"token function\">__init__<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> total_steps<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        self<span class=\"token punctuation\">.<\/span>stage_schedule <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><br \/>\n            <span class=\"token string\">&#039;resolution&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><br \/>\n                <span class=\"token punctuation\">(<\/span><span 
class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.3<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">256<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                <span class=\"token punctuation\">(<\/span><span class=\"token number\">0.3<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.7<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">640<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                <span class=\"token punctuation\">(<\/span><span class=\"token number\">0.7<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1.0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1328<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            <span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n            <span class=\"token string\">&#039;text_complexity&#039;<\/span><span class=\"token punctuation\">:<\/span> <span class=\"token punctuation\">[<\/span><br \/>\n                <span class=\"token punctuation\">(<\/span><span class=\"token number\">0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.4<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;word&#039;<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br \/>\n                <span class=\"token punctuation\">(<\/span><span class=\"token number\">0.4<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">0.8<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;line&#039;<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">,<\/span><br 
\/>\n                <span class=\"token punctuation\">(<\/span><span class=\"token number\">0.8<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token number\">1.0<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#039;paragraph&#039;<\/span><span class=\"token punctuation\">)<\/span><br \/>\n            <span class=\"token punctuation\">]<\/span><br \/>\n        <span class=\"token punctuation\">}<\/span><\/p>\n<p>    <span class=\"token keyword\">def<\/span> <span class=\"token function\">update<\/span><span class=\"token punctuation\">(<\/span>self<span class=\"token punctuation\">,<\/span> step<span class=\"token punctuation\">,<\/span> total<span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n        progress <span class=\"token operator\">&#061;<\/span> step <span class=\"token operator\">\/<\/span> total<br \/>\n        config <span class=\"token operator\">&#061;<\/span> <span class=\"token punctuation\">{<\/span><span class=\"token punctuation\">}<\/span><\/p>\n<p>        <span class=\"token keyword\">for<\/span> key<span class=\"token punctuation\">,<\/span> stages <span class=\"token keyword\">in<\/span> self<span class=\"token punctuation\">.<\/span>stage_schedule<span class=\"token punctuation\">.<\/span>items<span class=\"token punctuation\">(<\/span><span class=\"token punctuation\">)<\/span><span class=\"token punctuation\">:<\/span><br \/>\n            <span class=\"token keyword\">for<\/span> start<span class=\"token punctuation\">,<\/span> end<span class=\"token punctuation\">,<\/span> value <span class=\"token keyword\">in<\/span> stages<span class=\"token punctuation\">:<\/span><br \/>\n                <span class=\"token keyword\">if<\/span> start <span class=\"token operator\">&lt;&#061;<\/span> progress <span class=\"token operator\">&lt;<\/span> end<span class=\"token punctuation\">:<\/span><br \/>\n                    config<span class=\"token 
punctuation\">[<\/span>key<span class=\"token punctuation\">]<\/span> <span class=\"token operator\">&#061;<\/span> value<br \/>\n                    <span class=\"token keyword\">break<\/span><\/p>\n<p>        <span class=\"token keyword\">return<\/span> config<\/p>\n<h4>7 \u5e94\u7528\u573a\u666f\u4e0e\u672a\u6765\u65b9\u5411<\/h4>\n<h5>7.1 \u4ea7\u4e1a\u5e94\u7528\u4ef7\u503c<\/h5>\n<li>\u8bbe\u8ba1\u9886\u57df&#xff1a;\u81ea\u52a8\u751f\u6210\u6d77\u62a5\u3001UI\u754c\u9762generate_poster<span class=\"token punctuation\">(<\/span>title<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;\u590f\u5b63\u4fc3\u9500&#034;<\/span><span class=\"token punctuation\">,<\/span><br \/>\n               elements<span class=\"token operator\">&#061;<\/span><span class=\"token punctuation\">[<\/span><span class=\"token string\">&#034;\u6298\u6263\u6807\u7b7e&#034;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;\u4ea7\u54c1\u56fe\u7247&#034;<\/span><span class=\"token punctuation\">,<\/span> <span class=\"token string\">&#034;\u8054\u7cfb\u65b9\u5f0f&#034;<\/span><span class=\"token punctuation\">]<\/span><span class=\"token punctuation\">,<\/span><br \/>\n               style<span class=\"token operator\">&#061;<\/span><span class=\"token string\">&#034;\u73b0\u4ee3\u7b80\u7ea6&#034;<\/span><span class=\"token punctuation\">)<\/span>\n <\/li>\n<li>\u7535\u5b50\u5546\u52a1&#xff1a;\u591a\u8bed\u8a00\u5546\u54c1\u56fe\u6587\u751f\u6210<\/li>\n<li>\u6559\u80b2\u5185\u5bb9&#xff1a;\u56fe\u6587\u5e76\u8302\u7684\u6559\u5b66\u6750\u6599<\/li>\n<h5>7.2 \u6280\u672f\u6f14\u8fdb\u65b9\u5411<\/h5>\n<li>\u89c6\u9891\u751f\u6210\u6269\u5c55&#xff1a;\u5229\u7528VAE\u89c6\u9891\u89e3\u7801\u5668<\/li>\n<li>3D\u751f\u6210\u7edf\u4e00&#xff1a;\u65b0\u89c6\u89d2\u5408\u6210\u6280\u672f\u5ef6\u4f38generate_3d_asset<span class=\"token punctuation\">(<\/span>image<span class=\"token punctuation\">,<\/span> prompt<span class=\"token 
operator\">&#061;<\/span><span class=\"token string\">&#034;\u751f\u6210360\u5ea6\u65cb\u8f6c\u89c6\u56fe&#034;<\/span><span class=\"token punctuation\">)<\/span>\n <\/li>\n<li>\u611f\u77e5-\u751f\u6210\u4e00\u4f53\u5316&#xff1a;\u4e0eQwen-VL\u89c6\u89c9\u7406\u89e3\u6a21\u578b\u878d\u5408<\/li>\n<p>\u521b\u65b0\u542f\u793a&#xff1a;Qwen-Image\u7684\u6280\u672f\u7a81\u7834\u8bc1\u660e&#xff0c;\u751f\u6210\u6a21\u578b\u53ef\u901a\u8fc7\u5bf9\u5e95\u5c42\u5206\u5e03\u7684\u5efa\u6a21\u5b9e\u73b0\u6df1\u5ea6\u7406\u89e3&#xff0c;\u8fd9\u6a21\u7cca\u4e86\u751f\u6210\u4e0e\u7406\u89e3\u7684\u754c\u9650&#xff0c;\u4e3a\u591a\u6a21\u6001AGI\u5f00\u8f9f\u4e86\u65b0\u8def\u5f84\u3002<\/p>\n<h4>\u7ed3\u8bba<\/h4>\n<p>Qwen-Image\u901a\u8fc7\u4e09\u9879\u6839\u672c\u6027\u521b\u65b0\u91cd\u5851\u56fe\u50cf\u751f\u6210\u8303\u5f0f&#xff1a;<\/p>\n<li>\u6587\u672c\u6e32\u67d3\u9769\u65b0&#xff1a;\u6e10\u8fdb\u5f0f\u8bad\u7ec3\u7b56\u7565\u653b\u514b\u8868\u610f\u6587\u5b57\u96be\u9898<\/li>\n<li>\u7f16\u8f91\u4e00\u81f4\u6027\u7a81\u7834&#xff1a;\u53cc\u7f16\u7801\u673a\u5236\u5e73\u8861\u8bed\u4e49\u4e0e\u89c6\u89c9<\/li>\n<li>\u7edf\u4e00\u67b6\u6784\u8bbe\u8ba1&#xff1a;MMDiT\u9aa8\u5e72\u7f51\u7edc\u652f\u6301\u591a\u4efb\u52a1\u751f\u6210<\/li>\n<p>\u5176\u6280\u672f\u5f71\u54cd\u5df2\u8d85\u8d8a\u56fe\u50cf\u751f\u6210\u9886\u57df&#xff0c;\u9884\u793a\u4e86\u4e09\u4e2a\u672a\u6765\u65b9\u5411&#xff1a;<\/p>\n<li>\u751f\u6210\u5373\u7406\u89e3&#xff1a;\u901a\u8fc7\u751f\u6210\u8fc7\u7a0b\u5b9e\u73b0\u6df1\u5ea6\u89c6\u89c9\u7406\u89e3<\/li>\n<li>\u591a\u6a21\u6001\u7edf\u4e00&#xff1a;\u6587\u672c\u3001\u56fe\u50cf\u3001\u89c6\u9891\u7684\u8054\u5408\u5efa\u6a21<\/li>\n<li>\u4ea4\u4e92\u8303\u5f0f\u53d8\u9769&#xff1a;\u89c6\u89c9\u8bed\u8a00\u754c\u9762(VLI)\u53d6\u4ee3\u4f20\u7edfGUI<\/li>\n<p>\u968f\u7740\u4ee3\u7801\u5f00\u6e90&#xff08;GitHub\u94fe\u63a5&#xff09;\u548c\u6a21\u578b\u6743\u91cd\u91ca\u653e&#xff0c;Qwen-Image\u5c06\u6210\u4e3a\u751f\u6210\u5f0fAI\u53d1\u5c55\u7684\u91cd\u8981\u91cc\u7a0b\u
7891\u3002<\/p>\n<hr \/>\n<p>\u53c2\u8003\u6587\u732e&#xff1a;<\/p>\n<li>Qwen-Image Technical Report<\/li>\n<li>Esser P, et al. \u201cScaling Rectified Flow Transformers for High-Resolution Image Synthesis\u201d ICML 2024<\/li>\n<li>HuggingFace Diffusers Library<\/li>\n<li>MMDiT: Multi-Modal Diffusion Transformer<\/li>\n","protected":false},"excerpt":{"rendered":"<p>\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb7k\u6b21\uff0c\u70b9\u8d5e26\u6b21\uff0c\u6536\u85cf20\u6b21\u3002Qwen-Image\u662f\u4e00\u6b3e\u7a81\u7834\u6027\u7684\u89c6\u89c9\u751f\u6210\u6a21\u578b\uff0c\u9488\u5bf9\u6587\u672c\u6e32\u67d3\u548c\u56fe\u50cf\u7f16\u8f91\u4e24\u5927\u6838\u5fc3\u6311\u6218\u63d0\u51fa\u521b\u65b0\u89e3\u51b3\u65b9\u6848\u3002\u8be5\u6a21\u578b\u91c7\u7528\u6e10\u8fdb\u5f0f\u6587\u672c\u6e32\u67d3\u8bad\u7ec3\u548c\u53cc\u7f16\u7801\u673a\u5236\uff08Qwen2.5-VL\u8bed\u4e49\u7f16\u7801+VAE\u89c6\u89c9\u7f16\u7801\uff09\uff0c\u901a\u8fc7\u591a\u4efb\u52a1\u7edf\u4e00\u67b6\u6784\u5b9e\u73b0\u6587\u672c\u5230\u56fe\u50cf\u3001\u56fe\u50cf\u5230\u56fe\u50cf\u7b49\u529f\u80fd\u7684\u8054\u5408\u8bad\u7ec3\u3002\u5176\u6838\u5fc3\u521b\u65b0\u5305\u62ec\uff1a\u591a\u6a21\u6001\u8bed\u8a00\u6a21\u578b\u4f5c\u4e3a\u6761\u4ef6\u7f16\u7801\u5668\uff0c\u6df7\u5408\u53d8\u5206\u81ea\u7f16\u7801\u5668\u89e3\u51b3\u6587\u672c\u91cd\u5efa\u7f3a\u9677\uff0c\u4ee5\u53ca\u91c7\u7528MMDiT\u67b6\u6784\u548c\u521b\u65b0\u7684MSRoPE\u4f4d\u7f6e\u7f16\u7801\u3002\u6a21\u578b\u57fa\u4e8e\u767e\u4ebf\u7ea7\u56fe\u6587\u5bf9\u6570\u636e\u96c6\uff0c\u7ecf\u8fc7\u4e03\u9636\u6bb5\u7cbe\u5bc6\u8fc7\u6ee4\u6d41\u7a0b\uff0c\u7279\u522b\u9488\u5bf9\u4e2d\u6587\u7b49\u8868\u610f\u6587\u5b57\u7684\u6e32\u67d3\u95ee\u9898\u8bbe\u8ba1\u4e86\u6587\u672c\u5408\u6210\u589e\u5f3a\u7b56\u7565\u3002\u8bad\u7ec3\u91c7\u7528\u6d41\u5339\u914d\u76ee\u6807\u51fd<\/p>\n","protected":false},"author":2,"featured_media":56921,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"foot
notes":""},"categories":[1],"tags":[52,5929,50,227,4378],"topic":[],"class_list":["post-56923","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-server","tag-aigc","tag-qwen-image","tag-50","tag-227","tag-4378"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/56923.html\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" 
content=\"\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb7k\u6b21\uff0c\u70b9\u8d5e26\u6b21\uff0c\u6536\u85cf20\u6b21\u3002Qwen-Image\u662f\u4e00\u6b3e\u7a81\u7834\u6027\u7684\u89c6\u89c9\u751f\u6210\u6a21\u578b\uff0c\u9488\u5bf9\u6587\u672c\u6e32\u67d3\u548c\u56fe\u50cf\u7f16\u8f91\u4e24\u5927\u6838\u5fc3\u6311\u6218\u63d0\u51fa\u521b\u65b0\u89e3\u51b3\u65b9\u6848\u3002\u8be5\u6a21\u578b\u91c7\u7528\u6e10\u8fdb\u5f0f\u6587\u672c\u6e32\u67d3\u8bad\u7ec3\u548c\u53cc\u7f16\u7801\u673a\u5236\uff08Qwen2.5-VL\u8bed\u4e49\u7f16\u7801+VAE\u89c6\u89c9\u7f16\u7801\uff09\uff0c\u901a\u8fc7\u591a\u4efb\u52a1\u7edf\u4e00\u67b6\u6784\u5b9e\u73b0\u6587\u672c\u5230\u56fe\u50cf\u3001\u56fe\u50cf\u5230\u56fe\u50cf\u7b49\u529f\u80fd\u7684\u8054\u5408\u8bad\u7ec3\u3002\u5176\u6838\u5fc3\u521b\u65b0\u5305\u62ec\uff1a\u591a\u6a21\u6001\u8bed\u8a00\u6a21\u578b\u4f5c\u4e3a\u6761\u4ef6\u7f16\u7801\u5668\uff0c\u6df7\u5408\u53d8\u5206\u81ea\u7f16\u7801\u5668\u89e3\u51b3\u6587\u672c\u91cd\u5efa\u7f3a\u9677\uff0c\u4ee5\u53ca\u91c7\u7528MMDiT\u67b6\u6784\u548c\u521b\u65b0\u7684MSRoPE\u4f4d\u7f6e\u7f16\u7801\u3002\u6a21\u578b\u57fa\u4e8e\u767e\u4ebf\u7ea7\u56fe\u6587\u5bf9\u6570\u636e\u96c6\uff0c\u7ecf\u8fc7\u4e03\u9636\u6bb5\u7cbe\u5bc6\u8fc7\u6ee4\u6d41\u7a0b\uff0c\u7279\u522b\u9488\u5bf9\u4e2d\u6587\u7b49\u8868\u610f\u6587\u5b57\u7684\u6e32\u67d3\u95ee\u9898\u8bbe\u8ba1\u4e86\u6587\u672c\u5408\u6210\u589e\u5f3a\u7b56\u7565\u3002\u8bad\u7ec3\u91c7\u7528\u6d41\u5339\u914d\u76ee\u6807\u51fd\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/56923.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2025-08-14T17:14:54+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/08\/20250814171450-689e198ae2a8a.png\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" 
\/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"8 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/56923.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/56923.html\",\"name\":\"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2025-08-14T17:14:54+00:00\",\"dateModified\":\"2025-08-14T17:14:54+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/56923.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/56923.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/56923.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5
668\u6559\u7a0b\",\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/56923.html","og_locale":"zh_CN","og_type":"article","og_title":"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u6587\u7ae0\u6d4f\u89c8\u9605\u8bfb7k\u6b21\uff0c\u70b9\u8d5e26\u6b21\uff0c\u6536\u85cf20\u6b21\u3002Qwen-Image\u662f\u4e00\u6b3e\u7a81\u7834\u6027\u7684\u89c6\u89c9\u751f\u6210\u6a21\u578b\uff0c\u9488\u5bf9\u6587\u672c\u6e32\u67d3\u548c\u56fe\u50cf\u7f16\u8f91\u4e24\u5927\u6838\u5fc3\u6311\u6218\u63d0\u51fa\u521b\u65b0\u89e3\u51b3\u65b9\u6848\u3002\u8be5\u6a21\u578b\u91c7\u7528\u6e10\u8fdb\u5f0f\u6587\u672c\u6e32\u67d3\u8bad\u7ec3\u548c\u53cc\u7f16\u7801\u673a\u5236\uff08Qwen2.5-VL\u8bed\u4e49\u7f16\u7801+VAE\u89c6\u89c9\u7f16\u7801\uff09\uff0c\u901a\u8fc7\u591a\u4efb\u52a1\u7edf\u4e00\u67b6\u6784\u5b9e\u73b0\u6587\u672c\u5230\u56fe\u50cf\u3001\u56fe\u50cf\u5230\u56fe\u50cf\u7b49\u529f\u80fd\u7684\u8054\u5408\u8bad\u7ec3\u3002\u5176\u6838\u5fc3\u521b\u65b0\u5305\u62ec\uff1a\u591a\u6a21\u6001\u8bed\u8a00\u6a21\u578b\u4f5c\u4e3a\u6761\u4ef6\u7f16\u7801\u5668\uff0c\u6df7\u5408\u53d8\u5206\u81ea\u7f16\u7801\u5668\u89e3\u51b3\u6587\u672c\u91cd\u5efa\u7f3a\u9677\uff0c\u4ee5\u53ca\u91c7\u7528MMDiT\u67b6\u6784\u548c\u521b\u65b0\u7684MSRoPE\u4f4d\u7f6e\u7f16\u7801\u3002\u6a21\u578b\u57fa\u4e8e\u767e\u4ebf\u7ea7\u56fe\u6587\u5bf9\u6570\u636e\u96c6\uff0c\u7ecf\u8fc7\u4e03\u9636\u6bb5\u7cbe\u5bc6\u8fc7\u6ee4\u6d41\u7a0b\uff0c\u7279\u522b\u9488\u5bf9\u4e2d\u6587\u7b49\u8868\u610f\u6587\u5b57\u7684\u6e32\u67d3\u95ee\u9898\u8bbe\u8ba1\u4e86\u6587\u672c\u5408\u6210\u589e\u5f3a\u7b56\u7565\u3002\u8bad\u7ec3\u91c7\u7528\u6d41\u5339\u914d\u76ee\u6807\u51fd","og_url":"https:\/\/www.wsisp.com\/helps\/56923.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2025-08-14T17:14:54+00:00","og_image":[{"url":"https:\/\/www.wsisp.com\/helps\/wp-content\/uploads\/2025\/08\/20250814171450-689e198ae2a8a.png"}],"author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"8 
\u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/56923.html","url":"https:\/\/www.wsisp.com\/helps\/56923.html","name":"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2025-08-14T17:14:54+00:00","dateModified":"2025-08-14T17:14:54+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/56923.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/56923.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/56923.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"Qwen-Image\u6df1\u5ea6\u89e3\u6790\uff1a\u7a81\u7834\u6587\u672c\u6e32\u67d3\u4e0e\u56fe\u50cf\u7f16\u8f91\u7684\u89c6\u89c9\u751f\u6210\u5927\u6a21\u578b"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/56923","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=56923"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/56923\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media\/56921"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=56923"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=56923"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=56923"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=56923"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}