{"id":76997,"date":"2026-02-23T19:09:24","date_gmt":"2026-02-23T11:09:24","guid":{"rendered":"https:\/\/www.wsisp.com\/helps\/76997.html"},"modified":"2026-02-23T19:09:24","modified_gmt":"2026-02-23T11:09:24","slug":"%e6%9e%84%e5%bb%ba%e5%a4%a7%e8%a7%84%e6%a8%a1%e8%89%ba%e6%9c%af%e4%bd%9c%e5%93%81%e4%bf%a1%e6%81%af%e6%95%b0%e6%8d%ae%e5%ba%93%ef%bc%9a%e4%bb%8e%e9%9b%b6%e5%bc%80%e5%a7%8b%e7%9a%84python%e7%88%ac","status":"publish","type":"post","link":"https:\/\/www.wsisp.com\/helps\/76997.html","title":{"rendered":"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357"},"content":{"rendered":"<h3>\u4e00\u3001\u5f15\u8a00&#xff1a;\u4e3a\u4ec0\u4e48\u9700\u8981\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93&#xff1f;<\/h3>\n<p>\u5728\u6570\u5b57\u65f6\u4ee3&#xff0c;\u827a\u672f\u4f5c\u54c1\u7684\u6570\u5b57\u5316\u7ba1\u7406\u548c\u5206\u6790\u5df2\u6210\u4e3a\u827a\u672f\u754c\u3001\u5b66\u672f\u754c\u548c\u5546\u4e1a\u9886\u57df\u7684\u91cd\u8981\u9700\u6c42\u3002\u65e0\u8bba\u662f\u535a\u7269\u9986\u7684\u85cf\u54c1\u7ba1\u7406\u3001\u827a\u672f\u5e02\u573a\u7684\u4ef7\u683c\u5206\u6790&#xff0c;\u8fd8\u662fAI\u827a\u672f\u521b\u4f5c\u7684\u8bad\u7ec3\u6570\u636e&#xff0c;\u4e00\u4e2a\u7ed3\u6784\u5b8c\u5584\u7684\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93\u90fd\u626e\u6f14\u7740\u81f3\u5173\u91cd\u8981\u7684\u89d2\u8272\u3002<\/p>\n<h4>1.1 
\u827a\u672f\u4f5c\u54c1\u6570\u636e\u7684\u4ef7\u503c<\/h4>\n<p>\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5305\u542b\u4e86\u4e30\u5bcc\u7684\u4fe1\u606f\u7ef4\u5ea6&#xff1a;\u521b\u4f5c\u80cc\u666f\u3001\u827a\u672f\u5bb6\u751f\u5e73\u3001\u6280\u6cd5\u6750\u6599\u3001\u5c3a\u5bf8\u89c4\u683c\u3001\u6536\u85cf\u5386\u53f2\u3001\u5e02\u573a\u4ef7\u503c\u7b49\u3002\u8fd9\u4e9b\u6570\u636e\u4e0d\u4ec5\u5bf9\u4e8e\u827a\u672f\u53f2\u7814\u7a76\u5177\u6709\u5b66\u672f\u4ef7\u503c&#xff0c;\u8fd8\u5728\u4ee5\u4e0b\u9886\u57df\u53d1\u6325\u7740\u91cd\u8981\u4f5c\u7528&#xff1a;<\/p>\n<ul>\n<li>\n<p>\u827a\u672f\u5e02\u573a\u5206\u6790&#xff1a;\u901a\u8fc7\u5206\u6790\u4f5c\u54c1\u7684\u6210\u4ea4\u4ef7\u683c\u3001\u6d41\u901a\u9891\u7387\u7b49\u6570\u636e&#xff0c;\u53ef\u4ee5\u6d1e\u5bdf\u827a\u672f\u5e02\u573a\u7684\u8d70\u52bf\u548c\u89c4\u5f8b<\/p>\n<\/li>\n<li>\n<p>\u827a\u672f\u54c1\u9274\u5b9a&#xff1a;\u57fa\u4e8e\u5df2\u77e5\u4f5c\u54c1\u7684\u6280\u6cd5\u7279\u5f81\u3001\u6750\u6599\u4f7f\u7528\u7b49\u6570\u636e&#xff0c;\u4e3a\u9274\u5b9a\u63d0\u4f9b\u53c2\u8003\u4f9d\u636e<\/p>\n<\/li>\n<li>\n<p>\u6570\u5b57\u5316\u5c55\u89c8&#xff1a;\u6784\u5efa\u865a\u62df\u535a\u7269\u9986\u548c\u5728\u7ebf\u5c55\u89c8\u7684\u57fa\u7840\u6570\u636e\u652f\u6491<\/p>\n<\/li>\n<li>\n<p>\u827a\u672f\u6559\u80b2&#xff1a;\u4e3a\u5b66\u751f\u548c\u7231\u597d\u8005\u63d0\u4f9b\u7cfb\u7edf\u5316\u7684\u827a\u672f\u77e5\u8bc6\u5b66\u4e60\u8d44\u6e90<\/p>\n<\/li>\n<li>\n<p>AI\u827a\u672f\u521b\u4f5c&#xff1a;\u4e3a\u751f\u6210\u5bf9\u6297\u7f51\u7edc\u7b49\u6df1\u5ea6\u5b66\u4e60\u6a21\u578b\u63d0\u4f9b\u8bad\u7ec3\u6570\u636e<\/p>\n<\/li>\n<\/ul>\n<h4>1.2 
\u4f20\u7edf\u6570\u636e\u6536\u96c6\u65b9\u5f0f\u7684\u5c40\u9650\u6027<\/h4>\n<p>\u8fc7\u53bb&#xff0c;\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u7684\u6536\u96c6\u4e3b\u8981\u4f9d\u8d56\u4eba\u5de5\u6574\u7406&#xff0c;\u8fd9\u79cd\u65b9\u5f0f\u5b58\u5728\u660e\u663e\u4e0d\u8db3&#xff1a;<\/p>\n<ul>\n<li>\n<p>\u6548\u7387\u4f4e\u4e0b&#xff1a;\u624b\u52a8\u5f55\u5165\u4e00\u4ef6\u4f5c\u54c1\u53ef\u80fd\u9700\u8981\u6570\u5206\u949f&#xff0c;\u9762\u5bf9\u767e\u4e07\u7ea7\u4f5c\u54c1\u6570\u636e\u65f6\u51e0\u4e4e\u4e0d\u53ef\u884c<\/p>\n<\/li>\n<li>\n<p>\u66f4\u65b0\u6ede\u540e&#xff1a;\u65b0\u5c55\u89c8\u3001\u65b0\u4ea4\u6613\u7684\u4fe1\u606f\u65e0\u6cd5\u53ca\u65f6\u6536\u5f55<\/p>\n<\/li>\n<li>\n<p>\u8986\u76d6\u4e0d\u5168&#xff1a;\u53d7\u9650\u4e8e\u4eba\u529b&#xff0c;\u96be\u4ee5\u540c\u65f6\u5173\u6ce8\u591a\u4e2a\u827a\u672f\u673a\u6784\u548c\u6e20\u9053<\/p>\n<\/li>\n<li>\n<p>\u6807\u51c6\u5316\u56f0\u96be&#xff1a;\u4e0d\u540c\u6765\u6e90\u7684\u6570\u636e\u683c\u5f0f\u5404\u5f02&#xff0c;\u6574\u5408\u6210\u672c\u9ad8<\/p>\n<\/li>\n<\/ul>\n<h4>1.3 
Python\u722c\u866b\u7684\u89e3\u51b3\u65b9\u6848<\/h4>\n<p>Python\u722c\u866b\u6280\u672f\u4e3a\u6211\u4eec\u63d0\u4f9b\u4e86\u4e00\u4e2a\u9ad8\u6548\u3001\u53ef\u6269\u5c55\u7684\u6570\u636e\u91c7\u96c6\u65b9\u6848\u3002\u901a\u8fc7\u7f16\u5199\u81ea\u52a8\u5316\u7a0b\u5e8f&#xff0c;\u6211\u4eec\u53ef\u4ee5&#xff1a;<\/p>\n<ul>\n<li>\n<p>\u5927\u89c4\u6a21\u91c7\u96c6&#xff1a;\u540c\u65f6\u6293\u53d6\u591a\u4e2a\u827a\u672f\u7f51\u7ad9\u7684\u6570\u767e\u4e07\u6761\u4f5c\u54c1\u4fe1\u606f<\/p>\n<\/li>\n<li>\n<p>\u5b9e\u65f6\u66f4\u65b0&#xff1a;\u8bbe\u7f6e\u5b9a\u65f6\u4efb\u52a1&#xff0c;\u786e\u4fdd\u6570\u636e\u7684\u65b0\u9c9c\u5ea6<\/p>\n<\/li>\n<li>\n<p>\u667a\u80fd\u89e3\u6790&#xff1a;\u4f7f\u7528\u81ea\u7136\u8bed\u8a00\u5904\u7406\u548c\u56fe\u50cf\u8bc6\u522b\u6280\u672f\u63d0\u53d6\u7ed3\u6784\u5316\u4fe1\u606f<\/p>\n<\/li>\n<li>\n<p>\u5f02\u5e38\u5904\u7406&#xff1a;\u5e94\u5bf9\u7f51\u7ad9\u6539\u7248\u3001\u7f51\u7edc\u6ce2\u52a8\u7b49\u590d\u6742\u60c5\u51b5<\/p>\n<\/li>\n<li>\n<p>\u6570\u636e\u6e05\u6d17&#xff1a;\u81ea\u52a8\u53bb\u91cd\u3001\u683c\u5f0f\u8f6c\u6362\u3001\u7f3a\u5931\u503c\u5904\u7406<\/p>\n<\/li>\n<\/ul>\n<h3>\u4e8c\u3001\u9879\u76ee\u89c4\u5212\u4e0e\u6280\u672f\u9009\u578b<\/h3>\n<p>\u5728\u5f00\u59cb\u7f16\u5199\u4ee3\u7801\u4e4b\u524d&#xff0c;\u6211\u4eec\u9700\u8981\u5bf9\u6574\u4e2a\u9879\u76ee\u8fdb\u884c\u7cfb\u7edf\u6027\u7684\u89c4\u5212&#xff0c;\u660e\u786e\u6280\u672f\u8def\u7ebf\u548c\u67b6\u6784\u8bbe\u8ba1\u3002<\/p>\n<h4>2.1 \u6570\u636e\u6765\u6e90\u5206\u6790<\/h4>\n<p>\u6211\u4eec\u5c06\u4ece\u4ee5\u4e0b\u51e0\u4e2a\u4e3b\u8981\u6e20\u9053\u91c7\u96c6\u827a\u672f\u4f5c\u54c1\u6570\u636e&#xff1a;<\/p>\n<p>2.1.1 
\u4e3b\u6d41\u827a\u672f\u5e73\u53f0<\/p>\n<ul>\n<li>\n<p>WikiArt&#xff1a;\u5305\u542b\u8d85\u8fc725\u4e07\u4ef6\u827a\u672f\u4f5c\u54c1&#xff0c;\u6db5\u76d6\u6570\u5343\u4f4d\u827a\u672f\u5bb6<\/p>\n<\/li>\n<li>\n<p>Artsy&#xff1a;\u5f53\u4ee3\u827a\u672f\u5e73\u53f0&#xff0c;\u63d0\u4f9b\u8be6\u7ec6\u7684\u827a\u672f\u5bb6\u548c\u4f5c\u54c1\u4fe1\u606f<\/p>\n<\/li>\n<li>\n<p>Art Institute of Chicago&#xff1a;\u5f00\u653eAPI&#xff0c;\u63d0\u4f9b\u9ad8\u8d28\u91cf\u7684\u9986\u85cf\u6570\u636e<\/p>\n<\/li>\n<li>\n<p>\u5927\u90fd\u4f1a\u827a\u672f\u535a\u7269\u9986&#xff1a;\u5b8c\u6574\u7684\u516c\u5f00\u6570\u636e\u96c6<\/p>\n<\/li>\n<li>\n<p>\u6b27\u6d32\u6570\u5b57\u56fe\u4e66\u9986&#xff1a;\u805a\u5408\u6b27\u6d32\u591a\u5bb6\u535a\u7269\u9986\u7684\u85cf\u54c1\u6570\u636e<\/p>\n<\/li>\n<\/ul>\n<p>2.1.2 \u76ee\u6807\u6570\u636e\u5b57\u6bb5<\/p>\n<p>\u6211\u4eec\u9700\u8981\u91c7\u96c6\u7684\u7ed3\u6784\u5316\u4fe1\u606f\u5305\u62ec&#xff1a;<\/p>\n<p>python<\/p>\n<p>\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u7ed3\u6784 &#061; {<br \/>\n    &#034;basic_info&#034;: {<br \/>\n        &#034;title&#034;: &#034;\u4f5c\u54c1\u540d\u79f0&#034;,<br \/>\n        &#034;artist&#034;: &#034;\u827a\u672f\u5bb6&#034;,<br \/>\n        &#034;date&#034;: &#034;\u521b\u4f5c\u5e74\u4efd&#034;,<br \/>\n        &#034;medium&#034;: &#034;\u6750\u8d28\/\u6280\u6cd5&#034;,<br \/>\n        &#034;dimensions&#034;: &#034;\u5c3a\u5bf8&#xff08;\u9ad8\u00d7\u5bbd&#xff09;&#034;,<br \/>\n        &#034;collection&#034;: &#034;\u6536\u85cf\u673a\u6784&#034;<br \/>\n    },<br \/>\n    &#034;detailed_info&#034;: {<br \/>\n        &#034;description&#034;: &#034;\u4f5c\u54c1\u63cf\u8ff0&#034;,<br \/>\n        &#034;provenance&#034;: &#034;\u6765\u6e90\u5386\u53f2&#034;,<br \/>\n        &#034;exhibition_history&#034;: &#034;\u5c55\u89c8\u5386\u53f2&#034;,<br \/>\n        &#034;bibliography&#034;: &#034;\u53c2\u8003\u6587\u732e&#034;,<br \/>\n        &#034;signature&#034;: &#034;\u7b7e\u540d\u4f4d\u7f6e&#034;<br \/>\n    
},<br \/>\n    &#034;market_info&#034;: {<br \/>\n        &#034;estimated_price&#034;: &#034;\u4f30\u4ef7&#034;,<br \/>\n        &#034;sale_price&#034;: &#034;\u6210\u4ea4\u4ef7&#034;,<br \/>\n        &#034;auction_house&#034;: &#034;\u62cd\u5356\u884c&#034;,<br \/>\n        &#034;sale_date&#034;: &#034;\u62cd\u5356\u65e5\u671f&#034;<br \/>\n    },<br \/>\n    &#034;media_info&#034;: {<br \/>\n        &#034;image_url&#034;: &#034;\u56fe\u7247\u94fe\u63a5&#034;,<br \/>\n        &#034;thumbnail_url&#034;: &#034;\u7f29\u7565\u56fe\u94fe\u63a5&#034;,<br \/>\n        &#034;image_metadata&#034;: &#034;\u56fe\u7247\u5143\u6570\u636e&#034;<br \/>\n    }<br \/>\n} <\/p>\n<h4>2.2 \u6280\u672f\u6808\u9009\u62e9<\/h4>\n<p>\u57fa\u4e8e2024-2025\u5e74\u7684\u6700\u65b0\u6280\u672f\u53d1\u5c55&#xff0c;\u6211\u4eec\u9009\u62e9\u4ee5\u4e0b\u6280\u672f\u6808&#xff1a;<\/p>\n<p>2.2.1 \u6838\u5fc3\u722c\u866b\u6846\u67b6<\/p>\n<p>python<\/p>\n<p># \u73af\u5883\u8981\u6c42<br \/>\nPython &gt;&#061; 3.11<br \/>\naiohttp &gt;&#061; 3.9.0  # \u5f02\u6b65HTTP\u5ba2\u6237\u7aef<br \/>\nhttpx &gt;&#061; 0.27.0   # \u652f\u6301HTTP\/2\u7684\u5ba2\u6237\u7aef<br \/>\nScrapy &gt;&#061; 2.11    # \u5206\u5e03\u5f0f\u722c\u866b\u6846\u67b6<br \/>\nPlaywright &gt;&#061; 1.40 # \u65e0\u5934\u6d4f\u89c8\u5668\u81ea\u52a8\u5316<br \/>\nSelenium &gt;&#061; 4.15  # Web\u81ea\u52a8\u5316\u6d4b\u8bd5\u5de5\u5177 <\/p>\n<p>2.2.2 \u6570\u636e\u89e3\u6790\u4e0e\u5904\u7406<\/p>\n<p>python<\/p>\n<p># \u6570\u636e\u89e3\u6790\u5e93<br \/>\nBeautifulSoup4 &gt;&#061; 4.12  # HTML\u89e3\u6790<br \/>\nlxml &gt;&#061; 4.9             # \u9ad8\u6027\u80fdXML\u89e3\u6790<br \/>\nparsel &gt;&#061; 1.8           # \u57fa\u4e8elxml\u7684\u9009\u62e9\u5668<br \/>\njsonpath-ng &gt;&#061; 1.6      # JSON\u8def\u5f84\u67e5\u8be2<br \/>\njmespath &gt;&#061; 1.0         # \u53e6\u4e00\u4e2aJSON\u67e5\u8be2\u5e93 <\/p>\n<p>2.2.3 \u6570\u636e\u5b58\u50a8\u65b9\u6848<\/p>\n<p>python<\/p>\n<p># \u6570\u636e\u5e93\u9009\u62e9<br 
\/>\nMongoDB &gt;&#061; 7.0          # \u6587\u6863\u6570\u636e\u5e93&#xff0c;\u9002\u5408\u5b58\u50a8JSON\u683c\u5f0f\u7684\u827a\u672f\u54c1\u6570\u636e<br \/>\nPostgreSQL &gt;&#061; 16        # \u5173\u7cfb\u578b\u6570\u636e\u5e93&#xff0c;\u9002\u5408\u7ed3\u6784\u5316\u6570\u636e\u5b58\u50a8<br \/>\nRedis &gt;&#061; 7.2            # \u7f13\u5b58\u548c\u4efb\u52a1\u961f\u5217<br \/>\nElasticsearch &gt;&#061; 8.11   # \u5168\u6587\u641c\u7d22\u548c\u5206\u6790<br \/>\nMinIO &gt;&#061; RELEASE.2024   # \u5bf9\u8c61\u5b58\u50a8&#xff0c;\u5b58\u50a8\u56fe\u7247\u7b49\u5a92\u4f53\u6587\u4ef6 <\/p>\n<p>2.2.4 \u6570\u636e\u6e05\u6d17\u4e0e\u589e\u5f3a<\/p>\n<p>python<\/p>\n<p># \u6570\u636e\u5904\u7406\u5de5\u5177<br \/>\npandas &gt;&#061; 2.1           # \u6570\u636e\u5206\u6790<br \/>\nnumpy &gt;&#061; 1.26           # \u6570\u503c\u8ba1\u7b97<br \/>\npillow &gt;&#061; 10.1          # \u56fe\u50cf\u5904\u7406<br \/>\nopencv-python &gt;&#061; 4.9    # \u8ba1\u7b97\u673a\u89c6\u89c9<br \/>\npytesseract &gt;&#061; 0.3.10   # OCR\u6587\u5b57\u8bc6\u522b <\/p>\n<h4>2.3 \u7cfb\u7edf\u67b6\u6784\u8bbe\u8ba1<\/h4>\n<p>\u6211\u4eec\u5c06\u91c7\u7528\u5206\u5e03\u5f0f\u5fae\u670d\u52a1\u67b6\u6784&#xff0c;\u786e\u4fdd\u7cfb\u7edf\u7684\u53ef\u6269\u5c55\u6027\u548c\u7a33\u5b9a\u6027&#xff1a;<\/p>\n<p>text<\/p>\n<p>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502                    \u8c03\u5ea6\u4e2d\u5fc3                           \u2502<br \/>\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510          \u2502<br 
\/>\n\u2502  \u2502\u5b9a\u65f6\u8c03\u5ea6\u5668 \u2502  \u2502\u4f18\u5148\u7ea7\u961f\u5217 \u2502  \u2502\u5931\u8d25\u91cd\u8bd5\u5668 \u2502          \u2502<br \/>\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518          \u2502<br \/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n                            \u2502<br \/>\n        \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n        \u25bc                   \u25bc                   \u25bc<br \/>\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502  \u722c\u866b\u8282\u70b91     \u2502    \u2502  \u722c\u866b\u8282\u70b92     \u2502    \u2502  \u722c\u866b\u8282\u70b9N     \u2502<br \/>\n\u2502  &#8211; WikiArt    \u2502    \u2502  &#8211; Artsy      \u2502    \u2502  &#8211; \u5176\u4ed6\u5e73\u53f0    \u2502<br \/>\n\u2502  &#8211; \u901f\u7387\u9650\u5236    \u2502    \u2502  &#8211; \u4ee3\u7406IP\u6c60    \u2502    \u2502  &#8211; \u5206\u5e03\u5f0f      \u2502<br 
\/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518    \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518    \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n        \u2502                   \u2502                   \u2502<br \/>\n        \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n                            \u25bc<br \/>\n                \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n                \u2502   \u6d88\u606f\u961f\u5217&#xff08;Kafka&#xff09;   \u2502<br \/>\n                \u2502   &#8211; \u6570\u636e\u5206\u53d1         \u2502<br \/>\n                \u2502   &#8211; \u6d41\u91cf\u524a\u5cf0         \u2502<br \/>\n                \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n                            \u2502<br \/>\n        \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n        \u25bc                   \u25bc                   \u25bc<br \/>\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510    \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n\u2502  
\u6e05\u6d17\u7ba1\u90531     \u2502    \u2502  \u6e05\u6d17\u7ba1\u90532     \u2502    \u2502  \u6e05\u6d17\u7ba1\u9053N     \u2502<br \/>\n\u2502  &#8211; \u6570\u636e\u9a8c\u8bc1    \u2502    \u2502  &#8211; \u683c\u5f0f\u8f6c\u6362    \u2502    \u2502  &#8211; \u6570\u636e\u589e\u5f3a    \u2502<br \/>\n\u2502  &#8211; \u53bb\u91cd\u5904\u7406    \u2502    \u2502  &#8211; \u5b9e\u4f53\u8bc6\u522b    \u2502    \u2502  &#8211; \u56fe\u50cf\u5904\u7406    \u2502<br \/>\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518    \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518    \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n        \u2502                   \u2502                   \u2502<br \/>\n        \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518<br \/>\n                            \u25bc<br \/>\n                \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510<br \/>\n                \u2502    \u6570\u636e\u5b58\u50a8\u5c42         \u2502<br \/>\n                \u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510  \u2502<br \/>\n                \u2502  \u2502   MongoDB     \u2502  \u2502<br \/>\n                \u2502  \u2502   PostgreSQL  \u2502  \u2502<br \/>\n                \u2502  \u2502   Elasticsearch\u2502  \u2502<br \/>\n                \u2502  \u2502   MinIO       \u2502  \u2502<br \/>\n                \u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518  \u2502<br \/>\n                
\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518 <\/p>\n<h3>\u4e09\u3001\u73af\u5883\u642d\u5efa\u4e0e\u57fa\u7840\u914d\u7f6e<\/h3>\n<h4>3.1 \u5f00\u53d1\u73af\u5883\u51c6\u5907<\/h4>\n<p>\u9996\u5148&#xff0c;\u6211\u4eec\u9700\u8981\u521b\u5efa\u4e00\u4e2a\u72ec\u7acb\u7684Python\u73af\u5883&#xff0c;\u786e\u4fdd\u4f9d\u8d56\u5305\u7248\u672c\u7684\u4e00\u81f4\u6027&#xff1a;<\/p>\n<p>bash<\/p>\n<p># \u521b\u5efa\u865a\u62df\u73af\u5883<br \/>\npython3.11 -m venv art_crawler_env<\/p>\n<p># \u6fc0\u6d3b\u865a\u62df\u73af\u5883&#xff08;Linux\/Mac&#xff09;<br \/>\nsource art_crawler_env\/bin\/activate<\/p>\n<p># \u6fc0\u6d3b\u865a\u62df\u73af\u5883&#xff08;Windows&#xff09;<br \/>\nart_crawler_env\\Scripts\\activate<\/p>\n<p># \u5347\u7ea7pip\u548csetuptools<br \/>\npython -m pip install --upgrade pip setuptools wheel <\/p>\n<h4>3.2 \u521b\u5efa\u9879\u76ee\u7ed3\u6784<\/h4>\n<p>\u4e00\u4e2a\u6e05\u6670\u7684\u9879\u76ee\u7ed3\u6784\u5bf9\u4e8e\u5927\u578b\u722c\u866b\u9879\u76ee\u81f3\u5173\u91cd\u8981&#xff1a;<\/p>\n<p>bash<\/p>\n<p>art_crawler_project\/<br \/>\n\u251c\u2500\u2500 docker\/                      # Docker\u914d\u7f6e<br \/>\n\u2502   \u251c\u2500\u2500 docker-compose.yml       # \u670d\u52a1\u7f16\u6392<br \/>\n\u2502   \u251c\u2500\u2500 mongodb\/                  # MongoDB\u914d\u7f6e<br \/>\n\u2502   \u251c\u2500\u2500 postgres\/                  # PostgreSQL\u914d\u7f6e<br \/>\n\u2502   \u2514\u2500\u2500 redis\/                    # Redis\u914d\u7f6e<br \/>\n\u251c\u2500\u2500 src\/                          # \u6e90\u4ee3\u7801<br \/>\n\u2502   \u251c\u2500\u2500 crawlers\/                  # \u722c\u866b\u6a21\u5757<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 base.py               # \u57fa\u7840\u722c\u866b\u7c7b<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 wikiart.py            # 
WikiArt\u722c\u866b<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 artsy.py              # Artsy\u722c\u866b<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 museum_api.py         # \u535a\u7269\u9986API\u722c\u866b<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 middlewares\/          # \u4e2d\u95f4\u4ef6<br \/>\n\u2502   \u2502       \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u2502       \u251c\u2500\u2500 proxy.py          # \u4ee3\u7406\u4e2d\u95f4\u4ef6<br \/>\n\u2502   \u2502       \u251c\u2500\u2500 retry.py          # \u91cd\u8bd5\u4e2d\u95f4\u4ef6<br \/>\n\u2502   \u2502       \u2514\u2500\u2500 rate_limiter.py   # \u901f\u7387\u9650\u5236<br \/>\n\u2502   \u251c\u2500\u2500 parsers\/                   # \u89e3\u6790\u5668<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 html_parser.py        # HTML\u89e3\u6790\u5668<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 json_parser.py        # JSON\u89e3\u6790\u5668<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 image_parser.py       # \u56fe\u50cf\u5143\u6570\u636e\u89e3\u6790<br \/>\n\u2502   \u251c\u2500\u2500 pipelines\/                 # \u6570\u636e\u7ba1\u9053<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 validation.py         # \u6570\u636e\u9a8c\u8bc1<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 deduplication.py      # \u6570\u636e\u53bb\u91cd<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 enrichment.py         # \u6570\u636e\u589e\u5f3a<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 storage.py            # \u6570\u636e\u5b58\u50a8<br \/>\n\u2502   \u251c\u2500\u2500 models\/                    # \u6570\u636e\u6a21\u578b<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 artwork.py            # \u4f5c\u54c1\u6a21\u578b<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 artist.py             # \u827a\u672f\u5bb6\u6a21\u578b<br \/>\n\u2502   \u2502   
\u2514\u2500\u2500 museum.py             # \u535a\u7269\u9986\u6a21\u578b<br \/>\n\u2502   \u251c\u2500\u2500 utils\/                     # \u5de5\u5177\u51fd\u6570<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 logger.py             # \u65e5\u5fd7\u5de5\u5177<br \/>\n\u2502   \u2502   \u251c\u2500\u2500 validator.py          # \u9a8c\u8bc1\u5de5\u5177<br \/>\n\u2502   \u2502   \u2514\u2500\u2500 helpers.py            # \u8f85\u52a9\u51fd\u6570<br \/>\n\u2502   \u2514\u2500\u2500 config\/                    # \u914d\u7f6e\u6587\u4ef6<br \/>\n\u2502       \u251c\u2500\u2500 __init__.py<br \/>\n\u2502       \u251c\u2500\u2500 settings.py           # \u722c\u866b\u914d\u7f6e<br \/>\n\u2502       \u2514\u2500\u2500 constants.py          # \u5e38\u91cf\u5b9a\u4e49<br \/>\n\u251c\u2500\u2500 tests\/                         # \u6d4b\u8bd5\u4ee3\u7801<br \/>\n\u2502   \u251c\u2500\u2500 __init__.py<br \/>\n\u2502   \u251c\u2500\u2500 test_crawlers\/<br \/>\n\u2502   \u251c\u2500\u2500 test_parsers\/<br \/>\n\u2502   \u2514\u2500\u2500 test_pipelines\/<br \/>\n\u251c\u2500\u2500 scripts\/                       # \u811a\u672c\u5de5\u5177<br \/>\n\u2502   \u251c\u2500\u2500 init_db.py                # \u6570\u636e\u5e93\u521d\u59cb\u5316<br \/>\n\u2502   \u251c\u2500\u2500 run_crawler.py            # \u8fd0\u884c\u722c\u866b<br \/>\n\u2502   \u2514\u2500\u2500 clean_data.py             # \u6570\u636e\u6e05\u7406<br \/>\n\u251c\u2500\u2500 requirements\/                  # \u4f9d\u8d56\u7ba1\u7406<br \/>\n\u2502   \u251c\u2500\u2500 base.txt                  # \u57fa\u7840\u4f9d\u8d56<br \/>\n\u2502   \u251c\u2500\u2500 dev.txt                   # \u5f00\u53d1\u4f9d\u8d56<br \/>\n\u2502   \u2514\u2500\u2500 prod.txt                  # \u751f\u4ea7\u4f9d\u8d56<br \/>\n\u251c\u2500\u2500 .env.example                   # \u73af\u5883\u53d8\u91cf\u793a\u4f8b<br \/>\n\u251c\u2500\u2500 .gitignore                     # 
Git\u5ffd\u7565\u6587\u4ef6<br \/>\n\u251c\u2500\u2500 README.md                      # \u9879\u76ee\u8bf4\u660e<br \/>\n\u2514\u2500\u2500 setup.py                       # \u5b89\u88c5\u811a\u672c <\/p>\n<h4>3.3 \u914d\u7f6e\u6587\u4ef6\u7f16\u5199<\/h4>\n<p>3.3.1 \u57fa\u7840\u914d\u7f6e (src\/config\/settings.py)<\/p>\n<p>python<\/p>\n<p>import os<br \/>\nfrom pathlib import Path<br \/>\nfrom typing import Dict, List, Optional<br \/>\nfrom pydantic_settings import BaseSettings<br \/>\nfrom pydantic import Field, validator<\/p>\n<p>class CrawlerSettings(BaseSettings):<br \/>\n    &#034;&#034;&#034;\u722c\u866b\u914d\u7f6e\u7c7b&#xff0c;\u4f7f\u7528Pydantic\u8fdb\u884c\u914d\u7f6e\u9a8c\u8bc1&#034;&#034;&#034;<\/p>\n<p>    # \u9879\u76ee\u8def\u5f84<br \/>\n    BASE_DIR: Path &#061; Path(__file__).resolve().parent.parent.parent<br \/>\n    LOG_DIR: Path &#061; BASE_DIR \/ &#034;logs&#034;<br \/>\n    DATA_DIR: Path &#061; BASE_DIR \/ &#034;data&#034;<\/p>\n<p>    # \u722c\u866b\u914d\u7f6e<br \/>\n    USER_AGENTS: List[str] &#061; [<br \/>\n        &#034;Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/120.0.0.0 Safari\/537.36&#034;,<br \/>\n        &#034;Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit\/605.1.15 (KHTML, like Gecko) Version\/17.1 Safari\/605.1.15&#034;,<br \/>\n        &#034;Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/120.0.0.0 Safari\/537.36&#034;<br \/>\n    ]<\/p>\n<p>    # \u8bf7\u6c42\u914d\u7f6e<br \/>\n    REQUEST_TIMEOUT: int &#061; 30<br \/>\n    MAX_RETRIES: int &#061; 3<br \/>\n    RETRY_DELAY: int &#061; 5<br \/>\n    CONCURRENT_REQUESTS: int &#061; 10<br \/>\n    DOWNLOAD_DELAY: float &#061; 1.0<\/p>\n<p>    # \u4ee3\u7406\u914d\u7f6e<br \/>\n    USE_PROXY: bool &#061; False<br \/>\n    PROXY_POOL: List[str] &#061; []<br \/>\n    PROXY_ROTATION_INTERVAL: int &#061; 300  # 5\u5206\u949f\u66f4\u6362\u4e00\u6b21\u4ee3\u7406<\/p>\n<p>    # 
\u6570\u636e\u5e93\u914d\u7f6e<br \/>\n    MONGODB_URI: str &#061; Field(default&#061;&#034;mongodb:\/\/localhost:27017\/&#034;, env&#061;&#034;MONGODB_URI&#034;)<br \/>\n    MONGODB_DB: str &#061; &#034;art_database&#034;<br \/>\n    MONGODB_COLLECTION: str &#061; &#034;artworks&#034;<\/p>\n<p>    POSTGRESQL_CONFIG: Dict &#061; {<br \/>\n        &#034;host&#034;: os.getenv(&#034;POSTGRES_HOST&#034;, &#034;localhost&#034;),<br \/>\n        &#034;port&#034;: int(os.getenv(&#034;POSTGRES_PORT&#034;, 5432)),<br \/>\n        &#034;database&#034;: os.getenv(&#034;POSTGRES_DB&#034;, &#034;art_db&#034;),<br \/>\n        &#034;user&#034;: os.getenv(&#034;POSTGRES_USER&#034;, &#034;art_user&#034;),<br \/>\n        &#034;password&#034;: os.getenv(&#034;POSTGRES_PASSWORD&#034;, &#034;art_password&#034;)<br \/>\n    }<\/p>\n<p>    REDIS_CONFIG: Dict &#061; {<br \/>\n        &#034;host&#034;: os.getenv(&#034;REDIS_HOST&#034;, &#034;localhost&#034;),<br \/>\n        &#034;port&#034;: int(os.getenv(&#034;REDIS_PORT&#034;, 6379)),<br \/>\n        &#034;db&#034;: int(os.getenv(&#034;REDIS_DB&#034;, 0)),<br \/>\n        &#034;password&#034;: os.getenv(&#034;REDIS_PASSWORD&#034;, None)<br \/>\n    }<\/p>\n<p>    ELASTICSEARCH_CONFIG: Dict &#061; {<br \/>\n        &#034;hosts&#034;: [os.getenv(&#034;ELASTICSEARCH_HOST&#034;, &#034;http:\/\/localhost:9200&#034;)],<br \/>\n        &#034;index&#034;: &#034;artworks&#034;<br \/>\n    }<\/p>\n<p>    # \u901f\u7387\u9650\u5236\u914d\u7f6e<br \/>\n    RATE_LIMIT_ENABLED: bool &#061; True<br \/>\n    DEFAULT_RATE: int &#061; 5  # \u6bcf\u79d2\u8bf7\u6c42\u6570<br \/>\n    BURST_RATE: int &#061; 10   # \u7a81\u53d1\u8bf7\u6c42\u6570<\/p>\n<p>    # \u722c\u866b\u76ee\u6807\u914d\u7f6e<br \/>\n    TARGET_SITES: Dict[str, Dict] &#061; {<br \/>\n        &#034;wikiart&#034;: {<br \/>\n            &#034;base_url&#034;: &#034;https:\/\/www.wikiart.org&#034;,<br \/>\n            &#034;enabled&#034;: True,<br \/>\n            &#034;rate_limit&#034;: 2, 
 # \u6bcf\u79d22\u4e2a\u8bf7\u6c42<br \/>\n            &#034;max_pages&#034;: 1000,<br \/>\n            &#034;use_playwright&#034;: False<br \/>\n        },<br \/>\n        &#034;artsy&#034;: {<br \/>\n            &#034;base_url&#034;: &#034;https:\/\/www.artsy.net&#034;,<br \/>\n            &#034;enabled&#034;: True,<br \/>\n            &#034;rate_limit&#034;: 1,  # API\u9650\u5236\u8f83\u4e25\u683c<br \/>\n            &#034;max_pages&#034;: 500,<br \/>\n            &#034;use_playwright&#034;: True,  # \u9700\u8981\u5904\u7406JavaScript<br \/>\n            &#034;api_key&#034;: os.getenv(&#034;ARTSY_API_KEY&#034;, &#034;&#034;)<br \/>\n        },<br \/>\n        &#034;met_museum&#034;: {<br \/>\n            &#034;base_url&#034;: &#034;https:\/\/collectionapi.metmuseum.org&#034;,<br \/>\n            &#034;enabled&#034;: True,<br \/>\n            &#034;rate_limit&#034;: 10,  # \u516c\u5171API\u9650\u5236\u8f83\u5bbd\u677e<br \/>\n            &#034;max_pages&#034;: 2000,<br \/>\n            &#034;use_playwright&#034;: False,<br \/>\n            &#034;api_key&#034;: os.getenv(&#034;MET_API_KEY&#034;, &#034;&#034;)<br \/>\n        }<br \/>\n    }<\/p>\n<p>    # \u65e5\u5fd7\u914d\u7f6e<br \/>\n    LOG_LEVEL: str &#061; &#034;INFO&#034;<br \/>\n    LOG_FORMAT: str &#061; &#034;%(asctime)s &#8211; %(name)s &#8211; %(levelname)s &#8211; %(message)s&#034;<br \/>\n    LOG_ROTATION: str &#061; &#034;1 day&#034;<br \/>\n    LOG_RETENTION: str &#061; &#034;30 days&#034;<\/p>\n<p>    # \u6570\u636e\u6e05\u6d17\u914d\u7f6e<br \/>\n    DEDUPLICATION_ENABLED: bool &#061; True<br \/>\n    DEDUPLICATION_FIELDS: List[str] &#061; [&#034;title&#034;, &#034;artist&#034;, &#034;date&#034;]<br \/>\n    SIMILARITY_THRESHOLD: float &#061; 0.85  # \u76f8\u4f3c\u5ea6\u9608\u503c<\/p>\n<p>    # \u56fe\u50cf\u5904\u7406\u914d\u7f6e<br \/>\n    DOWNLOAD_IMAGES: bool &#061; True<br \/>\n    IMAGE_STORAGE_PATH: Path &#061; DATA_DIR \/ &#034;images&#034;<br \/>\n    THUMBNAIL_SIZE: tuple &#061; 
(300, 300)<br \/>\n    ALLOWED_IMAGE_FORMATS: List[str] &#061; [&#034;jpg&#034;, &#034;jpeg&#034;, &#034;png&#034;, &#034;gif&#034;, &#034;webp&#034;]<\/p>\n<p>    class Config:<br \/>\n        env_file &#061; &#034;.env&#034;<br \/>\n        case_sensitive &#061; True<\/p>\n<p>    &#064;validator(&#034;LOG_DIR&#034;, &#034;DATA_DIR&#034;, &#034;IMAGE_STORAGE_PATH&#034;)<br \/>\n    def create_directories(cls, v):<br \/>\n        &#034;&#034;&#034;\u81ea\u52a8\u521b\u5efa\u5fc5\u8981\u7684\u76ee\u5f55&#034;&#034;&#034;<br \/>\n        v.mkdir(parents&#061;True, exist_ok&#061;True)<br \/>\n        return v<\/p>\n<p># \u521b\u5efa\u5168\u5c40\u914d\u7f6e\u5b9e\u4f8b<br \/>\nsettings &#061; CrawlerSettings() <\/p>\n<p>3.3.2 \u5e38\u91cf\u5b9a\u4e49 (src\/config\/constants.py)<\/p>\n<p>python<\/p>\n<p>from enum import Enum, auto<br \/>\nfrom typing import Dict, Any<\/p>\n<p>class ArtworkMedium(Enum):<br \/>\n    &#034;&#034;&#034;\u827a\u672f\u4f5c\u54c1\u5a92\u4ecb\u679a\u4e3e&#034;&#034;&#034;<br \/>\n    PAINTING &#061; &#034;\u7ed8\u753b&#034;<br \/>\n    SCULPTURE &#061; &#034;\u96d5\u5851&#034;<br \/>\n    PHOTOGRAPHY &#061; &#034;\u6444\u5f71&#034;<br \/>\n    PRINT &#061; &#034;\u7248\u753b&#034;<br \/>\n    DRAWING &#061; &#034;\u7d20\u63cf&#034;<br \/>\n    INSTALLATION &#061; &#034;\u88c5\u7f6e&#034;<br \/>\n    DIGITAL_ART &#061; &#034;\u6570\u5b57\u827a\u672f&#034;<br \/>\n    MIXED_MEDIA &#061; &#034;\u6df7\u5408\u5a92\u4ecb&#034;<br \/>\n    CERAMICS &#061; &#034;\u9676\u74f7&#034;<br \/>\n    TEXTILE &#061; &#034;\u7eba\u7ec7&#034;<br \/>\n    PERFORMANCE &#061; &#034;\u884c\u4e3a\u827a\u672f&#034;<br \/>\n    VIDEO_ART &#061; &#034;\u89c6\u9891\u827a\u672f&#034;<\/p>\n<p>class ArtworkPeriod(Enum):<br \/>\n    &#034;&#034;&#034;\u827a\u672f\u65f6\u671f\u679a\u4e3e&#034;&#034;&#034;<br \/>\n    ANCIENT &#061; &#034;\u53e4\u4ee3\u827a\u672f&#034;<br \/>\n    MEDIEVAL &#061; &#034;\u4e2d\u4e16\u7eaa\u827a\u672f&#034;<br \/>\n    RENAISSANCE &#061; 
&#034;\u6587\u827a\u590d\u5174&#034;<br \/>\n    BAROQUE &#061; &#034;\u5df4\u6d1b\u514b&#034;<br \/>\n    ROCOCO &#061; &#034;\u6d1b\u53ef\u53ef&#034;<br \/>\n    NEOCLASSICISM &#061; &#034;\u65b0\u53e4\u5178\u4e3b\u4e49&#034;<br \/>\n    ROMANTICISM &#061; &#034;\u6d6a\u6f2b\u4e3b\u4e49&#034;<br \/>\n    REALISM &#061; &#034;\u73b0\u5b9e\u4e3b\u4e49&#034;<br \/>\n    IMPRESSIONISM &#061; &#034;\u5370\u8c61\u6d3e&#034;<br \/>\n    POST_IMPRESSIONISM &#061; &#034;\u540e\u5370\u8c61\u6d3e&#034;<br \/>\n    MODERN &#061; &#034;\u73b0\u4ee3\u827a\u672f&#034;<br \/>\n    CONTEMPORARY &#061; &#034;\u5f53\u4ee3\u827a\u672f&#034;<\/p>\n<p># \u827a\u672f\u6d41\u6d3e\u6620\u5c04\u5b57\u5178<br \/>\nART_MOVEMENTS: Dict[str, str] &#061; {<br \/>\n    &#034;cubism&#034;: &#034;\u7acb\u4f53\u4e3b\u4e49&#034;,<br \/>\n    &#034;surrealism&#034;: &#034;\u8d85\u73b0\u5b9e\u4e3b\u4e49&#034;,<br \/>\n    &#034;abstract-expressionism&#034;: &#034;\u62bd\u8c61\u8868\u73b0\u4e3b\u4e49&#034;,<br \/>\n    &#034;pop-art&#034;: &#034;\u6ce2\u666e\u827a\u672f&#034;,<br \/>\n    &#034;minimalism&#034;: &#034;\u6781\u7b80\u4e3b\u4e49&#034;,<br \/>\n    &#034;conceptual-art&#034;: &#034;\u6982\u5ff5\u827a\u672f&#034;,<br \/>\n    &#034;fauvism&#034;: &#034;\u91ce\u517d\u6d3e&#034;,<br \/>\n    &#034;expressionism&#034;: &#034;\u8868\u73b0\u4e3b\u4e49&#034;,<br \/>\n    &#034;art-nouveau&#034;: &#034;\u65b0\u827a\u672f\u8fd0\u52a8&#034;,<br \/>\n    &#034;bauhaus&#034;: &#034;\u5305\u8c6a\u65af&#034;,<br \/>\n    &#034;dadaism&#034;: &#034;\u8fbe\u8fbe\u4e3b\u4e49&#034;,<br \/>\n    &#034;futurism&#034;: &#034;\u672a\u6765\u4e3b\u4e49&#034;<br \/>\n}<\/p>\n<p># HTTP\u72b6\u6001\u7801\u542b\u4e49<br \/>\nHTTP_STATUS_CODES: Dict[int, str] &#061; {<br \/>\n    200: &#034;\u6210\u529f&#034;,<br \/>\n    201: &#034;\u5df2\u521b\u5efa&#034;,<br \/>\n    204: &#034;\u65e0\u5185\u5bb9&#034;,<br \/>\n    301: &#034;\u6c38\u4e45\u91cd\u5b9a\u5411&#034;,<br \/>\n    302: 
&#034;\u4e34\u65f6\u91cd\u5b9a\u5411&#034;,<br \/>\n    400: &#034;\u9519\u8bef\u8bf7\u6c42&#034;,<br \/>\n    401: &#034;\u672a\u6388\u6743&#034;,<br \/>\n    403: &#034;\u7981\u6b62\u8bbf\u95ee&#034;,<br \/>\n    404: &#034;\u672a\u627e\u5230&#034;,<br \/>\n    429: &#034;\u8bf7\u6c42\u8fc7\u591a&#034;,<br \/>\n    500: &#034;\u670d\u52a1\u5668\u5185\u90e8\u9519\u8bef&#034;,<br \/>\n    502: &#034;\u7f51\u5173\u9519\u8bef&#034;,<br \/>\n    503: &#034;\u670d\u52a1\u4e0d\u53ef\u7528&#034;,<br \/>\n    504: &#034;\u7f51\u5173\u8d85\u65f6&#034;<br \/>\n}<\/p>\n<p># \u6570\u636e\u9a8c\u8bc1\u6b63\u5219\u8868\u8fbe\u5f0f<br \/>\nREGEX_PATTERNS: Dict[str, str] &#061; {<br \/>\n    &#034;year&#034;: r&#034;^(1[0-9]{3}|2[0-9]{3})$&#034;,  # 1000-2999\u5e74<br \/>\n    &#034;dimension&#034;: r&#034;^\\d&#043;(\\.\\d&#043;)?\\s*[xX]\\s*\\d&#043;(\\.\\d&#043;)?\\s*(cm|mm|in)$&#034;,  # \u5c3a\u5bf8\u683c\u5f0f<br \/>\n    &#034;artist_name&#034;: r&#034;^[A-Za-z\\s\\-&#039;.]&#043;$&#034;,  # \u827a\u672f\u5bb6\u59d3\u540d&#xff08;\u7b80\u5316\u7248&#xff09;<br \/>\n    &#034;price&#034;: r&#034;^\\$?\\d&#043;(,\\d{3})*(\\.\\d{2})?$&#034;  # \u4ef7\u683c\u683c\u5f0f<br \/>\n} <\/p>\n<h4>3.4 Docker\u73af\u5883\u914d\u7f6e<\/h4>\n<p>\u4e3a\u4e86\u786e\u4fdd\u5f00\u53d1\u73af\u5883\u7684\u4e00\u81f4\u6027&#xff0c;\u6211\u4eec\u4f7f\u7528Docker\u6765\u7ba1\u7406\u6240\u6709\u4f9d\u8d56\u670d\u52a1&#xff1a;<\/p>\n<p>yaml<\/p>\n<p># docker\/docker-compose.yml<br \/>\nversion: &#039;3.8&#039;<\/p>\n<p>services:<br \/>\n  mongodb:<br \/>\n    image: mongo:7.0<br \/>\n    container_name: art_mongodb<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;27017:27017&#034;<br \/>\n    environment:<br \/>\n      MONGO_INITDB_ROOT_USERNAME: admin<br \/>\n      MONGO_INITDB_ROOT_PASSWORD: admin123<br \/>\n      MONGO_INITDB_DATABASE: art_database<br \/>\n    volumes:<br \/>\n      - .\/mongodb\/data:\/data\/db<br \/>\n      - 
.\/mongodb\/init.js:\/docker-entrypoint-initdb.d\/init.js:ro<br \/>\n    networks:<br \/>\n      - art_network<br \/>\n    healthcheck:<br \/>\n      test: [&#034;CMD&#034;, &#034;mongosh&#034;, &#034;--eval&#034;, &#034;db.adminCommand(&#039;ping&#039;)&#034;]<br \/>\n      interval: 30s<br \/>\n      timeout: 10s<br \/>\n      retries: 3<\/p>\n<p>  postgres:<br \/>\n    image: postgres:16<br \/>\n    container_name: art_postgres<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;5432:5432&#034;<br \/>\n    environment:<br \/>\n      POSTGRES_DB: art_db<br \/>\n      POSTGRES_USER: art_user<br \/>\n      POSTGRES_PASSWORD: art_password<br \/>\n    volumes:<br \/>\n      - .\/postgres\/data:\/var\/lib\/postgresql\/data<br \/>\n      - .\/postgres\/init.sql:\/docker-entrypoint-initdb.d\/init.sql:ro<br \/>\n    networks:<br \/>\n      - art_network<br \/>\n    healthcheck:<br \/>\n      test: [&#034;CMD-SHELL&#034;, &#034;pg_isready -U art_user&#034;]<br \/>\n      interval: 30s<br \/>\n      timeout: 10s<br \/>\n      retries: 3<\/p>\n<p>  redis:<br \/>\n    image: redis:7.2<br \/>\n    container_name: art_redis<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;6379:6379&#034;<br \/>\n    command: redis-server --requirepass redis_password<br \/>\n    volumes:<br \/>\n      - .\/redis\/data:\/data<br \/>\n    networks:<br \/>\n      - art_network<br \/>\n    healthcheck:<br \/>\n      test: [&#034;CMD&#034;, &#034;redis-cli&#034;, &#034;ping&#034;]<br \/>\n      interval: 30s<br \/>\n      timeout: 10s<br \/>\n      retries: 3<\/p>\n<p>  elasticsearch:<br \/>\n    image: elasticsearch:8.11.1<br \/>\n    container_name: art_elasticsearch<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;9200:9200&#034;<br \/>\n    environment:<br \/>\n      - discovery.type&#061;single-node<br \/>\n      - xpack.security.enabled&#061;false<br \/>\n      
- &#034;ES_JAVA_OPTS&#061;-Xms512m -Xmx512m&#034;<br \/>\n    volumes:<br \/>\n      - .\/elasticsearch\/data:\/usr\/share\/elasticsearch\/data<br \/>\n    networks:<br \/>\n      - art_network<br \/>\n    healthcheck:<br \/>\n      test: [&#034;CMD&#034;, &#034;curl&#034;, &#034;-f&#034;, &#034;http:\/\/localhost:9200&#034;]<br \/>\n      interval: 30s<br \/>\n      timeout: 10s<br \/>\n      retries: 3<\/p>\n<p>  minio:<br \/>\n    image: minio\/minio:RELEASE.2024-01-13T07-53-03Z<br \/>\n    container_name: art_minio<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;9000:9000&#034;<br \/>\n      - &#034;9001:9001&#034;<br \/>\n    environment:<br \/>\n      MINIO_ROOT_USER: minioadmin<br \/>\n      MINIO_ROOT_PASSWORD: minioadmin123<br \/>\n    volumes:<br \/>\n      - .\/minio\/data:\/data<br \/>\n    command: server \/data --console-address &#034;:9001&#034;<br \/>\n    networks:<br \/>\n      - art_network<br \/>\n    healthcheck:<br \/>\n      test: [&#034;CMD&#034;, &#034;curl&#034;, &#034;-f&#034;, &#034;http:\/\/localhost:9000\/minio\/health\/live&#034;]<br \/>\n      interval: 30s<br \/>\n      timeout: 10s<br \/>\n      retries: 3<\/p>\n<p>  kafka:<br \/>\n    image: confluentinc\/cp-kafka:latest<br \/>\n    container_name: art_kafka<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;9092:9092&#034;<br \/>\n    environment:<br \/>\n      KAFKA_BROKER_ID: 1<br \/>\n      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181<br \/>\n      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT:\/\/localhost:9092<br \/>\n      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1<br \/>\n    depends_on:<br \/>\n      - zookeeper<br \/>\n    networks:<br \/>\n      - art_network<\/p>\n<p>  zookeeper:<br \/>\n    image: confluentinc\/cp-zookeeper:latest<br \/>\n    container_name: art_zookeeper<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;2181:2181&#034;<br \/>\n    
environment:<br \/>\n      ZOOKEEPER_CLIENT_PORT: 2181<br \/>\n      ZOOKEEPER_TICK_TIME: 2000<br \/>\n    networks:<br \/>\n      - art_network<\/p>\n<p>  crawler_ui:<br \/>\n    image: apify\/actor-web-scraper:latest<br \/>\n    container_name: art_crawler_ui<br \/>\n    restart: always<br \/>\n    ports:<br \/>\n      - &#034;8080:8080&#034;<br \/>\n    environment:<br \/>\n      - APIFY_LOCAL_STORAGE_DIR&#061;\/storage<br \/>\n    volumes:<br \/>\n      - .\/crawler_ui\/storage:\/storage<br \/>\n    networks:<br \/>\n      - art_network<\/p>\n<p>networks:<br \/>\n  art_network:<br \/>\n    driver: bridge <\/p>\n<h3>\u56db\u3001\u6838\u5fc3\u722c\u866b\u5b9e\u73b0<\/h3>\n<h4>4.1 \u57fa\u7840\u722c\u866b\u7c7b\u8bbe\u8ba1<\/h4>\n<p>\u6211\u4eec\u9996\u5148\u8bbe\u8ba1\u4e00\u4e2a\u57fa\u7840\u722c\u866b\u7c7b&#xff0c;\u5c01\u88c5\u901a\u7528\u7684\u722c\u866b\u529f\u80fd&#xff1a;<\/p>\n<p>python<\/p>\n<p># src\/crawlers\/base.py<\/p>\n<p>import asyncio<br \/>\nimport logging<br \/>\nimport random<br \/>\nfrom abc import ABC, abstractmethod<br \/>\nfrom typing import Any, Dict, List, Optional, Union<br \/>\nfrom datetime import datetime<br \/>\nfrom urllib.parse import urljoin, urlparse<\/p>\n<p>import aiohttp<br \/>\nfrom aiohttp import ClientTimeout, TCPConnector<br \/>\nfrom tenacity import (<br \/>\n    retry,<br \/>\n    stop_after_attempt,<br \/>\n    wait_exponential,<br \/>\n    retry_if_exception_type,<br \/>\n    before_sleep_log<br \/>\n)<br \/>\nimport backoff<br \/>\nfrom fake_useragent import UserAgent<\/p>\n<p>from src.config.settings import settings<br \/>\nfrom src.utils.logger import setup_logger<br \/>\nfrom src.utils.validator import DataValidator<br \/>\nfrom src.middlewares.rate_limiter import RateLimiter<br \/>\nfrom src.middlewares.proxy import ProxyMiddleware<\/p>\n<p>class BaseCrawler(ABC):<br \/>\n    &#034;&#034;&#034;<br \/>\n    
\u57fa\u7840\u722c\u866b\u7c7b&#xff0c;\u63d0\u4f9b\u901a\u7528\u7684\u722c\u866b\u529f\u80fd<br \/>\n    \u6240\u6709\u5177\u4f53\u5e73\u53f0\u7684\u722c\u866b\u90fd\u5e94\u8be5\u7ee7\u627f\u6b64\u7c7b<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self, name: str, config: Dict[str, Any]):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u521d\u59cb\u5316\u57fa\u7840\u722c\u866b<\/p>\n<p>        Args:<br \/>\n            name: \u722c\u866b\u540d\u79f0<br \/>\n            config: \u722c\u866b\u914d\u7f6e\u5b57\u5178<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.name &#061; name<br \/>\n        self.config &#061; config<br \/>\n        self.logger &#061; setup_logger(f&#034;crawler.{name}&#034;)<\/p>\n<p>        # \u521d\u59cb\u5316\u4e2d\u95f4\u4ef6<br \/>\n        self.rate_limiter &#061; RateLimiter(<br \/>\n            rate&#061;config.get(&#034;rate_limit&#034;, settings.DEFAULT_RATE),<br \/>\n            burst&#061;settings.BURST_RATE<br \/>\n        )<br \/>\n        self.proxy_middleware &#061; ProxyMiddleware() if settings.USE_PROXY else None<\/p>\n<p>        # \u521d\u59cb\u5316\u4f1a\u8bdd\u76f8\u5173\u5c5e\u6027<br \/>\n        self.session: Optional[aiohttp.ClientSession] &#061; None<br \/>\n        self.ua &#061; UserAgent()<\/p>\n<p>        # \u7edf\u8ba1\u4fe1\u606f<br \/>\n        self.stats &#061; {<br \/>\n            &#034;requests_made&#034;: 0,<br \/>\n            &#034;requests_success&#034;: 0,<br \/>\n            &#034;requests_failed&#034;: 0,<br \/>\n            &#034;items_extracted&#034;: 0,<br \/>\n            &#034;start_time&#034;: None,<br \/>\n            &#034;end_time&#034;: None<br \/>\n        }<\/p>\n<p>        # \u521b\u5efa\u8fde\u63a5\u5668<br \/>\n        self.connector &#061; TCPConnector(<br \/>\n            limit&#061;settings.CONCURRENT_REQUESTS,<br \/>\n            ttl_dns_cache&#061;300,<br \/>\n            ssl&#061;False,  # 
\u67d0\u4e9b\u827a\u672f\u7f51\u7ad9SSL\u8bc1\u4e66\u53ef\u80fd\u6709\u95ee\u9898<br \/>\n            force_close&#061;True<br \/>\n        )<\/p>\n<p>    async def __aenter__(self):<br \/>\n        &#034;&#034;&#034;\u5f02\u6b65\u4e0a\u4e0b\u6587\u7ba1\u7406\u5668\u5165\u53e3&#034;&#034;&#034;<br \/>\n        await self.start()<br \/>\n        return self<\/p>\n<p>    async def __aexit__(self, exc_type, exc_val, exc_tb):<br \/>\n        &#034;&#034;&#034;\u5f02\u6b65\u4e0a\u4e0b\u6587\u7ba1\u7406\u5668\u51fa\u53e3&#034;&#034;&#034;<br \/>\n        await self.close()<\/p>\n<p>    async def start(self):<br \/>\n        &#034;&#034;&#034;\u542f\u52a8\u722c\u866b&#034;&#034;&#034;<br \/>\n        self.logger.info(f&#034;Starting crawler: {self.name}&#034;)<br \/>\n        self.stats[&#034;start_time&#034;] &#061; datetime.now()<\/p>\n<p>        # \u521b\u5efaHTTP\u4f1a\u8bdd<br \/>\n        self.session &#061; aiohttp.ClientSession(<br \/>\n            connector&#061;self.connector,<br \/>\n            timeout&#061;ClientTimeout(total&#061;settings.REQUEST_TIMEOUT),<br \/>\n            headers&#061;self._get_default_headers()<br \/>\n        )<\/p>\n<p>    async def close(self):<br \/>\n        &#034;&#034;&#034;\u5173\u95ed\u722c\u866b&#034;&#034;&#034;<br \/>\n        self.logger.info(f&#034;Closing crawler: {self.name}&#034;)<br \/>\n        self.stats[&#034;end_time&#034;] &#061; datetime.now()<\/p>\n<p>        if self.session and not self.session.closed:<br \/>\n            await self.session.close()<\/p>\n<p>        # \u8bb0\u5f55\u6700\u7ec8\u7edf\u8ba1\u4fe1\u606f<br \/>\n        duration &#061; (self.stats[&#034;end_time&#034;] - self.stats[&#034;start_time&#034;]).total_seconds()<br \/>\n        self.logger.info(<br \/>\n            f&#034;Crawler stats &#8211; &#034;<br \/>\n            f&#034;Requests: {self.stats[&#039;requests_made&#039;]}, &#034;<br \/>\n            f&#034;Success: {self.stats[&#039;requests_success&#039;]}, &#034;<br \/>\n        
    f&#034;Failed: {self.stats[&#039;requests_failed&#039;]}, &#034;<br \/>\n            f&#034;Items: {self.stats[&#039;items_extracted&#039;]}, &#034;<br \/>\n            f&#034;Duration: {duration:.2f}s&#034;<br \/>\n        )<\/p>\n<p>    def _get_default_headers(self) -&gt; Dict[str, str]:<br \/>\n        &#034;&#034;&#034;\u83b7\u53d6\u9ed8\u8ba4\u8bf7\u6c42\u5934&#034;&#034;&#034;<br \/>\n        return {<br \/>\n            &#034;User-Agent&#034;: self.ua.random,<br \/>\n            &#034;Accept&#034;: &#034;text\/html,application\/xhtml&#043;xml,application\/xml;q&#061;0.9,image\/webp,*\/*;q&#061;0.8&#034;,<br \/>\n            &#034;Accept-Language&#034;: &#034;en-US,en;q&#061;0.5&#034;,<br \/>\n            &#034;Accept-Encoding&#034;: &#034;gzip, deflate, br&#034;,<br \/>\n            &#034;DNT&#034;: &#034;1&#034;,<br \/>\n            &#034;Connection&#034;: &#034;keep-alive&#034;,<br \/>\n            &#034;Upgrade-Insecure-Requests&#034;: &#034;1&#034;,<br \/>\n            &#034;Sec-Fetch-Dest&#034;: &#034;document&#034;,<br \/>\n            &#034;Sec-Fetch-Mode&#034;: &#034;navigate&#034;,<br \/>\n            &#034;Sec-Fetch-Site&#034;: &#034;none&#034;,<br \/>\n            &#034;Sec-Fetch-User&#034;: &#034;?1&#034;,<br \/>\n            &#034;Cache-Control&#034;: &#034;max-age&#061;0&#034;<br \/>\n        }<\/p>\n<p>    async def _get_proxy(self) -&gt; Optional[str]:<br \/>\n        &#034;&#034;&#034;\u83b7\u53d6\u4ee3\u7406&#034;&#034;&#034;<br \/>\n        if self.proxy_middleware:<br \/>\n            return await self.proxy_middleware.get_proxy()<br \/>\n        return None<\/p>\n<p>    &#064;retry(<br \/>\n        stop&#061;stop_after_attempt(settings.MAX_RETRIES),<br \/>\n        wait&#061;wait_exponential(multiplier&#061;1, min&#061;4, max&#061;10),<br \/>\n        retry&#061;retry_if_exception_type(<br \/>\n            (aiohttp.ClientError, asyncio.TimeoutError, ConnectionError)<br \/>\n        ),<br \/>\n        
before_sleep&#061;before_sleep_log(logging.getLogger(), logging.WARNING)<br \/>\n    )<br \/>\n    async def fetch(<br \/>\n        self,<br \/>\n        url: str,<br \/>\n        method: str &#061; &#034;GET&#034;,<br \/>\n        params: Optional[Dict] &#061; None,<br \/>\n        data: Optional[Dict] &#061; None,<br \/>\n        headers: Optional[Dict] &#061; None,<br \/>\n        cookies: Optional[Dict] &#061; None,<br \/>\n        proxy: Optional[str] &#061; None,<br \/>\n        timeout: Optional[int] &#061; None<br \/>\n    ) -&gt; Optional[aiohttp.ClientResponse]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u53d1\u9001HTTP\u8bf7\u6c42<\/p>\n<p>        Args:<br \/>\n            url: \u8bf7\u6c42URL<br \/>\n            method: \u8bf7\u6c42\u65b9\u6cd5<br \/>\n            params: URL\u53c2\u6570<br \/>\n            data: \u8bf7\u6c42\u6570\u636e<br \/>\n            headers: \u8bf7\u6c42\u5934<br \/>\n            cookies: Cookie<br \/>\n            proxy: \u4ee3\u7406<br \/>\n            timeout: \u8d85\u65f6\u65f6\u95f4<\/p>\n<p>        Returns:<br \/>\n            \u54cd\u5e94\u5bf9\u8c61<br \/>\n        &#034;&#034;&#034;<br \/>\n        # \u901f\u7387\u9650\u5236<br \/>\n        await self.rate_limiter.wait_if_needed()<\/p>\n<p>        # \u5408\u5e76\u8bf7\u6c42\u5934<br \/>\n        request_headers &#061; self._get_default_headers()<br \/>\n        if headers:<br \/>\n            request_headers.update(headers)<\/p>\n<p>        # \u83b7\u53d6\u4ee3\u7406<br \/>\n        if not proxy and settings.USE_PROXY:<br \/>\n            proxy &#061; await self._get_proxy()<\/p>\n<p>        self.stats[&#034;requests_made&#034;] &#043;&#061; 1<\/p>\n<p>        try:<br \/>\n            async with self.session.request(<br \/>\n                method&#061;method,<br \/>\n                url&#061;url,<br \/>\n                params&#061;params,<br \/>\n                json&#061;data if method.upper() &#061;&#061; &#034;POST&#034; else None,<br \/>\n                
data&#061;data if method.upper() !&#061; &#034;POST&#034; else None,<br \/>\n                headers&#061;request_headers,<br \/>\n                cookies&#061;cookies,<br \/>\n                proxy&#061;proxy,<br \/>\n                timeout&#061;timeout or settings.REQUEST_TIMEOUT,<br \/>\n                ssl&#061;False<br \/>\n            ) as response:<\/p>\n<p>                # \u68c0\u67e5\u54cd\u5e94\u72b6\u6001<br \/>\n                if response.status &#061;&#061; 200:<br \/>\n                    self.stats[&#034;requests_success&#034;] &#043;&#061; 1<br \/>\n                    return response<br \/>\n                elif response.status &#061;&#061; 429:  # \u8bf7\u6c42\u8fc7\u591a<br \/>\n                    retry_after &#061; response.headers.get(&#034;Retry-After&#034;, &#034;60&#034;)<br \/>\n                    self.logger.warning(f&#034;Rate limited. Retry after {retry_after}s&#034;)<br \/>\n                    await asyncio.sleep(int(retry_after))<br \/>\n                    raise aiohttp.ClientError(&#034;Rate limited&#034;)<br \/>\n                elif response.status &gt;&#061; 500:  # \u670d\u52a1\u5668\u9519\u8bef<br \/>\n                    self.logger.error(f&#034;Server error: {response.status}&#034;)<br \/>\n                    raise aiohttp.ClientError(f&#034;Server error: {response.status}&#034;)<br \/>\n                else:<br \/>\n                    self.logger.error(f&#034;HTTP error: {response.status} &#8211; {url}&#034;)<br \/>\n                    self.stats[&#034;requests_failed&#034;] &#043;&#061; 1<br \/>\n                    return None<\/p>\n<p>        except asyncio.TimeoutError:<br \/>\n            self.logger.error(f&#034;Timeout error: {url}&#034;)<br \/>\n            self.stats[&#034;requests_failed&#034;] &#043;&#061; 1<br \/>\n            raise<br \/>\n        except aiohttp.ClientError as e:<br \/>\n            self.logger.error(f&#034;Client error: {str(e)} &#8211; {url}&#034;)<br \/>\n            
self.stats[&#034;requests_failed&#034;] &#043;&#061; 1<br \/>\n            raise<br \/>\n        except Exception as e:<br \/>\n            self.logger.error(f&#034;Unexpected error: {str(e)} &#8211; {url}&#034;)<br \/>\n            self.stats[&#034;requests_failed&#034;] &#043;&#061; 1<br \/>\n            raise<\/p>\n<p>    &#064;abstractmethod<br \/>\n    async def parse(self, response: aiohttp.ClientResponse) -&gt; List[Dict[str, Any]]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u89e3\u6790\u54cd\u5e94&#xff08;\u62bd\u8c61\u65b9\u6cd5&#xff0c;\u5b50\u7c7b\u5fc5\u987b\u5b9e\u73b0&#xff09;<\/p>\n<p>        Args:<br \/>\n            response: \u54cd\u5e94\u5bf9\u8c61<\/p>\n<p>        Returns:<br \/>\n            \u89e3\u6790\u540e\u7684\u6570\u636e\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        pass<\/p>\n<p>    &#064;abstractmethod<br \/>\n    async def run(self, *args, **kwargs):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u8fd0\u884c\u722c\u866b&#xff08;\u62bd\u8c61\u65b9\u6cd5&#xff0c;\u5b50\u7c7b\u5fc5\u987b\u5b9e\u73b0&#xff09;<br \/>\n        &#034;&#034;&#034;<br \/>\n        pass<\/p>\n<p>    def validate_data(self, data: Dict[str, Any]) -&gt; bool:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u9a8c\u8bc1\u6570\u636e\u683c\u5f0f<\/p>\n<p>        Args:<br \/>\n            data: \u5f85\u9a8c\u8bc1\u7684\u6570\u636e<\/p>\n<p>        Returns:<br \/>\n            \u662f\u5426\u6709\u6548<br \/>\n        &#034;&#034;&#034;<br \/>\n        return DataValidator.validate_artwork(data)<\/p>\n<p>    async def save_data(self, data: Union[Dict, List[Dict]]):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u4fdd\u5b58\u6570\u636e\u5230\u7ba1\u9053<\/p>\n<p>        Args:<br \/>\n            data: \u5f85\u4fdd\u5b58\u7684\u6570\u636e<br \/>\n        &#034;&#034;&#034;<br \/>\n        # \u8fd9\u91cc\u53ef\u4ee5\u53d1\u9001\u5230\u6d88\u606f\u961f\u5217\u6216\u76f4\u63a5\u5b58\u50a8<br \/>\n        # 
\u540e\u7eed\u4f1a\u5728\u7ba1\u9053\u4e2d\u5b9e\u73b0<br \/>\n        pass<\/p>\n<p>    def normalize_url(self, base_url: str, relative_url: str) -&gt; str:<br \/>\n        &#034;&#034;&#034;\u89c4\u8303\u5316URL&#034;&#034;&#034;<br \/>\n        return urljoin(base_url, relative_url)<\/p>\n<p>    def extract_domain(self, url: str) -&gt; str:<br \/>\n        &#034;&#034;&#034;\u63d0\u53d6URL\u57df\u540d&#034;&#034;&#034;<br \/>\n        parsed &#061; urlparse(url)<br \/>\n        return parsed.netloc <\/p>\n<h4>4.2 \u4e2d\u95f4\u4ef6\u5b9e\u73b0<\/h4>\n<p>4.2.1 \u901f\u7387\u9650\u5236\u5668<\/p>\n<p>python<\/p>\n<p># src\/middlewares\/rate_limiter.py<\/p>\n<p>import asyncio<br \/>\nimport time<br \/>\nfrom collections import deque<br \/>\nfrom typing import Optional<br \/>\nimport threading<\/p>\n<p>class RateLimiter:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u901f\u7387\u9650\u5236\u5668&#xff0c;\u4f7f\u7528\u4ee4\u724c\u6876\u7b97\u6cd5<br \/>\n    \u652f\u6301\u7a81\u53d1\u6d41\u91cf\u548c\u5e73\u5747\u901f\u7387\u63a7\u5236<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self, rate: float, burst: Optional[int] &#061; None):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u521d\u59cb\u5316\u901f\u7387\u9650\u5236\u5668<\/p>\n<p>        Args:<br \/>\n            rate: \u5e73\u5747\u901f\u7387&#xff08;\u8bf7\u6c42\/\u79d2&#xff09;<br \/>\n            burst: \u7a81\u53d1\u5bb9\u91cf&#xff0c;\u9ed8\u8ba4\u7b49\u4e8erate<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.rate &#061; rate<br \/>\n        self.burst &#061; burst or int(rate)<br \/>\n        self.tokens &#061; self.burst<br \/>\n        self.last_update &#061; time.monotonic()<br \/>\n        self.lock &#061; asyncio.Lock()<\/p>\n<p>    async def wait_if_needed(self):<br \/>\n        &#034;&#034;&#034;\u5982\u679c\u9700\u8981\u5219\u7b49\u5f85&#034;&#034;&#034;<br \/>\n        async with self.lock:<br \/>\n            await self._update_tokens()<\/p>\n<p>            if self.tokens &lt; 
1:<br \/>\n                # \u8ba1\u7b97\u9700\u8981\u7b49\u5f85\u7684\u65f6\u95f4<br \/>\n                wait_time &#061; (1 - self.tokens) \/ self.rate<br \/>\n                await asyncio.sleep(wait_time)<br \/>\n                await self._update_tokens()<\/p>\n<p>            self.tokens -&#061; 1<\/p>\n<p>    async def _update_tokens(self):<br \/>\n        &#034;&#034;&#034;\u66f4\u65b0\u4ee4\u724c\u6570&#034;&#034;&#034;<br \/>\n        now &#061; time.monotonic()<br \/>\n        elapsed &#061; now - self.last_update<br \/>\n        self.tokens &#061; min(<br \/>\n            self.burst,<br \/>\n            self.tokens &#043; elapsed * self.rate<br \/>\n        )<br \/>\n        self.last_update &#061; now<\/p>\n<p>class SlidingWindowRateLimiter:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u6ed1\u52a8\u7a97\u53e3\u901f\u7387\u9650\u5236\u5668<br \/>\n    \u66f4\u7cbe\u786e\u7684\u901f\u7387\u63a7\u5236&#xff0c;\u9002\u5408\u4e25\u683c\u7684API\u9650\u5236<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self, max_requests: int, window_size: int &#061; 60):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u521d\u59cb\u5316\u6ed1\u52a8\u7a97\u53e3\u9650\u5236\u5668<\/p>\n<p>        Args:<br \/>\n            max_requests: \u7a97\u53e3\u5185\u6700\u5927\u8bf7\u6c42\u6570<br \/>\n            window_size: \u7a97\u53e3\u5927\u5c0f&#xff08;\u79d2&#xff09;<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.max_requests &#061; max_requests<br \/>\n        self.window_size &#061; window_size<br \/>\n        self.requests &#061; deque()<br \/>\n        self.lock &#061; asyncio.Lock()<\/p>\n<p>    async def wait_if_needed(self):<br \/>\n        &#034;&#034;&#034;\u68c0\u67e5\u5e76\u7b49\u5f85\u76f4\u5230\u53ef\u4ee5\u53d1\u9001\u8bf7\u6c42&#034;&#034;&#034;<br \/>\n        async with self.lock:<br \/>\n            now &#061; time.time()<\/p>\n<p>            # \u79fb\u9664\u7a97\u53e3\u5916\u7684\u8bf7\u6c42\u8bb0\u5f55<br \/>\n            while 
self.requests and self.requests[0] &lt; now - self.window_size:<br \/>\n                self.requests.popleft()<\/p>\n<p>            # \u5982\u679c\u8fbe\u5230\u9650\u5236&#xff0c;\u7b49\u5f85<br \/>\n            if len(self.requests) &gt;&#061; self.max_requests:<br \/>\n                wait_time &#061; self.requests[0] &#043; self.window_size - now<br \/>\n                if wait_time &gt; 0:<br \/>\n                    await asyncio.sleep(wait_time)<br \/>\n                # \u7b49\u5f85\u540e\u91cd\u65b0\u6e05\u7406<br \/>\n                while self.requests and self.requests[0] &lt; now - self.window_size:<br \/>\n                    self.requests.popleft()<\/p>\n<p>            # \u8bb0\u5f55\u5f53\u524d\u8bf7\u6c42<br \/>\n            self.requests.append(now) <\/p>\n<p>4.2.2 \u4ee3\u7406\u4e2d\u95f4\u4ef6<\/p>\n<p>python<\/p>\n<p># src\/middlewares\/proxy.py<\/p>\n<p>import random<br \/>\nimport asyncio<br \/>\nfrom typing import List, Optional<br \/>\nfrom datetime import datetime, timedelta<br \/>\nimport aiohttp<br \/>\nfrom src.config.settings import settings<\/p>\n<p>class ProxyMiddleware:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u4ee3\u7406\u4e2d\u95f4\u4ef6&#xff0c;\u7ba1\u7406\u4ee3\u7406IP\u6c60<br \/>\n    \u652f\u6301\u4ee3\u7406\u9a8c\u8bc1\u3001\u8f6e\u6362\u548c\u5065\u5eb7\u68c0\u67e5<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        self.proxies: List[dict] &#061; []<br \/>\n        self.current_index &#061; 0<br \/>\n        self.last_rotation &#061; datetime.now()<br \/>\n        self.lock &#061; asyncio.Lock()<\/p>\n<p>    async def get_proxy(self) -&gt; Optional[str]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u83b7\u53d6\u4e00\u4e2a\u53ef\u7528\u7684\u4ee3\u7406<\/p>\n<p>        Returns:<br \/>\n            \u4ee3\u7406URL\u6216None<br \/>\n        &#034;&#034;&#034;<br \/>\n        async with self.lock:<br \/>\n            # \u68c0\u67e5\u662f\u5426\u9700\u8981\u8f6e\u6362<br 
\/>\n            if self._should_rotate():<br \/>\n                await self._rotate_proxy()<\/p>\n<p>            if not self.proxies:<br \/>\n                # \u4ece\u914d\u7f6e\u6587\u4ef6\u6216\u4ee3\u7406\u670d\u52a1\u83b7\u53d6\u4ee3\u7406<br \/>\n                await self._fetch_proxies()<\/p>\n<p>            if self.proxies:<br \/>\n                proxy &#061; self.proxies[self.current_index % len(self.proxies)]<br \/>\n                self.current_index &#043;&#061; 1<br \/>\n                return proxy.get(&#034;url&#034;)<\/p>\n<p>        return None<\/p>\n<p>    async def report_failure(self, proxy_url: str):<br \/>\n        &#034;&#034;&#034;\u62a5\u544a\u4ee3\u7406\u5931\u8d25&#034;&#034;&#034;<br \/>\n        async with self.lock:<br \/>\n            for proxy in self.proxies:<br \/>\n                if proxy[&#034;url&#034;] &#061;&#061; proxy_url:<br \/>\n                    proxy[&#034;failures&#034;] &#061; proxy.get(&#034;failures&#034;, 0) &#043; 1<br \/>\n                    proxy[&#034;last_failure&#034;] &#061; datetime.now()<\/p>\n<p>                    # \u5982\u679c\u5931\u8d25\u6b21\u6570\u8fc7\u591a&#xff0c;\u6682\u65f6\u7981\u7528<br \/>\n                    if proxy[&#034;failures&#034;] &gt;&#061; 3:<br \/>\n                        proxy[&#034;disabled_until&#034;] &#061; datetime.now() &#043; timedelta(minutes&#061;5)<br \/>\n                    break<\/p>\n<p>    def _should_rotate(self) -&gt; bool:<br \/>\n        &#034;&#034;&#034;\u68c0\u67e5\u662f\u5426\u9700\u8981\u8f6e\u6362\u4ee3\u7406&#034;&#034;&#034;<br \/>\n        return (<br \/>\n            datetime.now() - self.last_rotation<br \/>\n        ).seconds &gt; settings.PROXY_ROTATION_INTERVAL<\/p>\n<p>    async def _rotate_proxy(self):<br \/>\n        &#034;&#034;&#034;\u8f6e\u6362\u4ee3\u7406&#034;&#034;&#034;<br \/>\n        self.current_index &#061; random.randint(0, max(0, len(self.proxies) - 1))<br \/>\n        self.last_rotation &#061; 
datetime.now()<\/p>\n<p>    async def _fetch_proxies(self):<br \/>\n        &#034;&#034;&#034;\u4ece\u4ee3\u7406\u670d\u52a1\u83b7\u53d6\u4ee3\u7406\u5217\u8868&#034;&#034;&#034;<br \/>\n        # \u8fd9\u91cc\u53ef\u4ee5\u5b9e\u73b0\u4ece\u514d\u8d39\u4ee3\u7406\u7f51\u7ad9\u6216\u4ed8\u8d39\u4ee3\u7406API\u83b7\u53d6\u4ee3\u7406<br \/>\n        # \u4f8b\u5982&#xff1a;ProxyBroker\u3001Scrapy-Proxy-Pool\u7b49<\/p>\n<p>        # \u793a\u4f8b&#xff1a;\u4ece\u914d\u7f6e\u6587\u4ef6\u8bfb\u53d6\u4ee3\u7406<br \/>\n        if settings.PROXY_POOL:<br \/>\n            for proxy_url in settings.PROXY_POOL:<br \/>\n                self.proxies.append({<br \/>\n                    &#034;url&#034;: proxy_url,<br \/>\n                    &#034;failures&#034;: 0,<br \/>\n                    &#034;last_check&#034;: None,<br \/>\n                    &#034;disabled_until&#034;: None<br \/>\n                })<\/p>\n<p>        # \u5982\u679c\u914d\u7f6e\u4e2d\u6ca1\u6709\u4ee3\u7406&#xff0c;\u53ef\u4ee5\u4f7f\u7528\u514d\u8d39\u7684\u4ee3\u7406API<br \/>\n        try:<br \/>\n            async with aiohttp.ClientSession() as session:<br \/>\n                # \u793a\u4f8b&#xff1a;\u4ece\u514d\u8d39\u4ee3\u7406API\u83b7\u53d6<br \/>\n                async with session.get(<br \/>\n                    &#034;https:\/\/api.proxyscrape.com\/v2\/?request&#061;getproxies&amp;protocol&#061;http&amp;timeout&#061;10000&amp;country&#061;all&amp;ssl&#061;all&amp;anonymity&#061;all&#034;<br \/>\n                ) as response:<br \/>\n                    if response.status &#061;&#061; 200:<br \/>\n                        text &#061; await response.text()<br \/>\n                        proxy_list &#061; text.strip().split(&#039;\\\\r\\\\n&#039;)<br \/>\n                        for proxy in proxy_list[:10]:  # \u9650\u5236\u6570\u91cf<br \/>\n                            self.proxies.append({<br \/>\n                                &#034;url&#034;: f&#034;http:\/\/{proxy}&#034;,<br \/>\n              
                  &#034;failures&#034;: 0,<br \/>\n                                &#034;last_check&#034;: datetime.now(),<br \/>\n                                &#034;disabled_until&#034;: None<br \/>\n                            })<br \/>\n        except Exception as e:<br \/>\n            print(f&#034;Error fetching proxies: {e}&#034;)<\/p>\n<p>    async def validate_proxy(self, proxy_url: str) -&gt; bool:<br \/>\n        &#034;&#034;&#034;\u9a8c\u8bc1\u4ee3\u7406\u662f\u5426\u53ef\u7528&#034;&#034;&#034;<br \/>\n        try:<br \/>\n            async with aiohttp.ClientSession() as session:<br \/>\n                async with session.get(<br \/>\n                    &#034;http:\/\/httpbin.org\/ip&#034;,<br \/>\n                    proxy&#061;proxy_url,<br \/>\n                    timeout&#061;10<br \/>\n                ) as response:<br \/>\n                    if response.status &#061;&#061; 200:<br \/>\n                        return True<br \/>\n        except:<br \/>\n            pass<br \/>\n        return False <\/p>\n<h4>4.3 \u5177\u4f53\u5e73\u53f0\u722c\u866b\u5b9e\u73b0<\/h4>\n<p>4.3.1 WikiArt\u722c\u866b<\/p>\n<p>python<\/p>\n<p># src\/crawlers\/wikiart.py<\/p>\n<p>import json<br \/>\nimport re<br \/>\nfrom typing import List, Dict, Any, Optional<br \/>\nfrom urllib.parse import urlencode<br \/>\nfrom bs4 import BeautifulSoup<br \/>\nimport asyncio<\/p>\n<p>from src.crawlers.base import BaseCrawler<br \/>\nfrom src.config.settings import settings<\/p>\n<p>class WikiArtCrawler(BaseCrawler):<br \/>\n    &#034;&#034;&#034;<br \/>\n    WikiArt\u7f51\u7ad9\u722c\u866b<br \/>\n    \u91c7\u96c6\u827a\u672f\u5bb6\u548c\u827a\u672f\u4f5c\u54c1\u4fe1\u606f<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        super().__init__(<br \/>\n            name&#061;&#034;wikiart&#034;,<br \/>\n            config&#061;settings.TARGET_SITES[&#034;wikiart&#034;]<br \/>\n        )<br \/>\n        self.base_url &#061; 
self.config[&#034;base_url&#034;]<br \/>\n        self.api_base &#061; &#034;https:\/\/www.wikiart.org\/en\/api\/2&#034;<\/p>\n<p>    async def run(self, max_pages: int &#061; None):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u8fd0\u884c\u722c\u866b<\/p>\n<p>        Args:<br \/>\n            max_pages: \u6700\u5927\u722c\u53d6\u9875\u6570<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.logger.info(&#034;Starting WikiArt crawler&#034;)<\/p>\n<p>        # \u5148\u83b7\u53d6\u827a\u672f\u5bb6\u5217\u8868<br \/>\n        artists &#061; await self._get_artists(max_pages)<\/p>\n<p>        # \u5e76\u53d1\u83b7\u53d6\u6bcf\u4e2a\u827a\u672f\u5bb6\u7684\u4f5c\u54c1<br \/>\n        tasks &#061; []<br \/>\n        for artist in artists[:10]:  # \u5148\u9650\u5236\u6570\u91cf\u6d4b\u8bd5<br \/>\n            task &#061; self._get_artist_artworks(artist)<br \/>\n            tasks.append(task)<\/p>\n<p>        results &#061; await asyncio.gather(*tasks, return_exceptions&#061;True)<\/p>\n<p>        # \u5904\u7406\u7ed3\u679c<br \/>\n        all_artworks &#061; []<br \/>\n        for artist_artworks in results:<br \/>\n            if isinstance(artist_artworks, Exception):<br \/>\n                self.logger.error(f&#034;Error getting artworks: {artist_artworks}&#034;)<br \/>\n            else:<br \/>\n                all_artworks.extend(artist_artworks)<\/p>\n<p>        self.logger.info(f&#034;Collected {len(all_artworks)} artworks&#034;)<\/p>\n<p>        # \u4fdd\u5b58\u6570\u636e<br \/>\n        await self.save_data(all_artworks)<\/p>\n<p>        return all_artworks<\/p>\n<p>    async def _get_artists(self, max_pages: int &#061; None) -&gt; List[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u83b7\u53d6\u827a\u672f\u5bb6\u5217\u8868<\/p>\n<p>        Args:<br \/>\n            max_pages: \u6700\u5927\u9875\u6570<\/p>\n<p>        Returns:<br \/>\n            \u827a\u672f\u5bb6\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        artists 
&#061; []<br \/>\n        page &#061; 1<br \/>\n        max_pages &#061; max_pages or self.config.get(&#034;max_pages&#034;, 100)<\/p>\n<p>        while page &lt;&#061; max_pages:<br \/>\n            url &#061; f&#034;{self.api_base}\/Artists&#034;<br \/>\n            params &#061; {<br \/>\n                &#034;paginationPage&#034;: page,<br \/>\n                &#034;paginationLimit&#034;: 100,<br \/>\n                &#034;imageLimit&#034;: 1,<br \/>\n                &#034;auth&#034;: &#034;&#034;  # \u53ef\u80fd\u9700\u8981\u8ba4\u8bc1<br \/>\n            }<\/p>\n<p>            self.logger.info(f&#034;Fetching artists page {page}&#034;)<\/p>\n<p>            response &#061; await self.fetch(url, params&#061;params)<br \/>\n            if not response:<br \/>\n                break<\/p>\n<p>            try:<br \/>\n                data &#061; await response.json()<br \/>\n                items &#061; data.get(&#034;data&#034;, [])<\/p>\n<p>                if not items:<br \/>\n                    break<\/p>\n<p>                for item in items:<br \/>\n                    artist &#061; {<br \/>\n                        &#034;name&#034;: item.get(&#034;artistName&#034;),<br \/>\n                        &#034;url&#034;: item.get(&#034;url&#034;),<br \/>\n                        &#034;birth_day&#034;: item.get(&#034;birthDayAsString&#034;),<br \/>\n                        &#034;death_day&#034;: item.get(&#034;deathDayAsString&#034;),<br \/>\n                        &#034;nationality&#034;: item.get(&#034;nationality&#034;),<br \/>\n                        &#034;art_movement&#034;: item.get(&#034;artMovement&#034;),<br \/>\n                        &#034;image&#034;: item.get(&#034;image&#034;),<br \/>\n                        &#034;wikiart_url&#034;: f&#034;{self.base_url}{item.get(&#039;url&#039;)}&#034;<br \/>\n                    }<br \/>\n                    artists.append(artist)<\/p>\n<p>                self.logger.info(f&#034;Found {len(items)} artists on 
page {page}&#034;)<\/p>\n<p>                # \u68c0\u67e5\u662f\u5426\u8fd8\u6709\u66f4\u591a\u9875<br \/>\n                pagination &#061; data.get(&#034;pagination&#034;, {})<br \/>\n                if page &gt;&#061; pagination.get(&#034;pages&#034;, 0):<br \/>\n                    break<\/p>\n<p>                page &#043;&#061; 1<\/p>\n<p>            except json.JSONDecodeError as e:<br \/>\n                self.logger.error(f&#034;JSON decode error: {e}&#034;)<br \/>\n                break<br \/>\n            except Exception as e:<br \/>\n                self.logger.error(f&#034;Error parsing artists: {e}&#034;)<br \/>\n                break<\/p>\n<p>            # \u907f\u514d\u8bf7\u6c42\u8fc7\u5feb<br \/>\n            await asyncio.sleep(1)<\/p>\n<p>        self.logger.info(f&#034;Total artists collected: {len(artists)}&#034;)<br \/>\n        return artists<\/p>\n<p>    async def _get_artist_artworks(self, artist: Dict) -&gt; List[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u83b7\u53d6\u827a\u672f\u5bb6\u7684\u6240\u6709\u4f5c\u54c1<\/p>\n<p>        Args:<br \/>\n            artist: \u827a\u672f\u5bb6\u4fe1\u606f<\/p>\n<p>        Returns:<br \/>\n            \u4f5c\u54c1\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        artworks &#061; []<br \/>\n        page &#061; 1<\/p>\n<p>        while True:<br \/>\n            url &#061; f&#034;{self.api_base}\/Artists\/{artist[&#039;url&#039;]}\/Paintings&#034;<br \/>\n            params &#061; {<br \/>\n                &#034;paginationPage&#034;: page,<br \/>\n                &#034;paginationLimit&#034;: 100,<br \/>\n                &#034;json&#034;: 2<br \/>\n            }<\/p>\n<p>            self.logger.info(f&#034;Fetching artworks for {artist[&#039;name&#039;]} &#8211; page {page}&#034;)<\/p>\n<p>            response &#061; await self.fetch(url, params&#061;params)<br \/>\n            if not response:<br \/>\n                break<\/p>\n<p>            try:<br \/>\n                
data &#061; await response.json()<br \/>\n                items &#061; data.get(&#034;data&#034;, [])<\/p>\n<p>                if not items:<br \/>\n                    break<\/p>\n<p>                for item in items:<br \/>\n                    artwork &#061; {<br \/>\n                        &#034;title&#034;: item.get(&#034;title&#034;),<br \/>\n                        &#034;artist&#034;: artist[&#034;name&#034;],<br \/>\n                        &#034;artist_url&#034;: artist[&#034;url&#034;],<br \/>\n                        &#034;date&#034;: item.get(&#034;completitionYear&#034;),<br \/>\n                        &#034;year&#034;: item.get(&#034;yearAsString&#034;),<br \/>\n                        &#034;medium&#034;: item.get(&#034;technique&#034;),<br \/>\n                        &#034;dimensions&#034;: f&#034;{item.get(&#039;width&#039;, &#039;&#039;)} x {item.get(&#039;height&#039;, &#039;&#039;)}&#034;.strip(),<br \/>\n                        &#034;style&#034;: item.get(&#034;artStyle&#034;),<br \/>\n                        &#034;genre&#034;: item.get(&#034;genre&#034;),<br \/>\n                        &#034;series&#034;: item.get(&#034;series&#034;),<br \/>\n                        &#034;image_url&#034;: item.get(&#034;image&#034;),<br \/>\n                        &#034;wikiart_url&#034;: f&#034;{self.base_url}{item.get(&#039;url&#039;)}&#034;,<br \/>\n                        &#034;description&#034;: item.get(&#034;description&#034;),<br \/>\n                        &#034;museum&#034;: item.get(&#034;museum&#034;, &#034;Unknown&#034;)<br \/>\n                    }<\/p>\n<p>                    # \u6570\u636e\u9a8c\u8bc1<br \/>\n                    if self.validate_data(artwork):<br \/>\n                        artworks.append(artwork)<br \/>\n                        self.stats[&#034;items_extracted&#034;] &#043;&#061; 1<\/p>\n<p>                self.logger.info(f&#034;Found {len(items)} artworks for {artist[&#039;name&#039;]} on page {page}&#034;)<\/p>\n<p> 
               # \u68c0\u67e5\u662f\u5426\u8fd8\u6709\u66f4\u591a\u9875<br \/>\n                pagination &#061; data.get(&#034;pagination&#034;, {})<br \/>\n                if page &gt;&#061; pagination.get(&#034;pages&#034;, 0):<br \/>\n                    break<\/p>\n<p>                page &#043;&#061; 1<\/p>\n<p>            except json.JSONDecodeError as e:<br \/>\n                self.logger.error(f&#034;JSON decode error: {e}&#034;)<br \/>\n                break<br \/>\n            except Exception as e:<br \/>\n                self.logger.error(f&#034;Error parsing artworks: {e}&#034;)<br \/>\n                break<\/p>\n<p>            await asyncio.sleep(1)<\/p>\n<p>        return artworks<\/p>\n<p>    async def parse(self, response):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u89e3\u6790HTML\u9875\u9762&#xff08;\u7528\u4e8e\u76f4\u63a5\u9875\u9762\u89e3\u6790\u6a21\u5f0f&#xff09;<br \/>\n        &#034;&#034;&#034;<br \/>\n        html &#061; await response.text()<br \/>\n        soup &#061; BeautifulSoup(html, &#039;lxml&#039;)<\/p>\n<p>        # \u793a\u4f8b&#xff1a;\u89e3\u6790\u4f5c\u54c1\u8be6\u60c5\u9875<br \/>\n        artwork &#061; {}<\/p>\n<p>        # \u6807\u9898<br \/>\n        title_elem &#061; soup.find(&#039;h1&#039;, class_&#061;&#039;painting-page-title&#039;)<br \/>\n        if title_elem:<br \/>\n            artwork[&#039;title&#039;] &#061; title_elem.text.strip()<\/p>\n<p>        # \u827a\u672f\u5bb6<br \/>\n        artist_elem &#061; soup.find(&#039;a&#039;, class_&#061;&#039;artist-name&#039;)<br \/>\n        if artist_elem:<br \/>\n            artwork[&#039;artist&#039;] &#061; artist_elem.text.strip()<\/p>\n<p>        # \u5e74\u4efd<br \/>\n        year_elem &#061; soup.find(&#039;span&#039;, itemprop&#061;&#039;dateCreated&#039;)<br \/>\n        if year_elem:<br \/>\n            artwork[&#039;year&#039;] &#061; year_elem.text.strip()<\/p>\n<p>        # \u63cf\u8ff0<br \/>\n        desc_elem &#061; 
soup.find(&#039;div&#039;, class_&#061;&#039;description&#039;)<br \/>\n        if desc_elem:<br \/>\n            artwork[&#039;description&#039;] &#061; desc_elem.text.strip()<\/p>\n<p>        # \u56fe\u7247<br \/>\n        img_elem &#061; soup.find(&#039;img&#039;, class_&#061;&#039;painting-image&#039;)<br \/>\n        if img_elem and img_elem.get(&#039;src&#039;):<br \/>\n            artwork[&#039;image_url&#039;] &#061; self.normalize_url(<br \/>\n                response.url,<br \/>\n                img_elem[&#039;src&#039;]<br \/>\n            )<\/p>\n<p>        return [artwork] if artwork else [] <\/p>\n<p>4.3.2 Artsy\u722c\u866b&#xff08;\u4f7f\u7528Playwright&#xff09;<\/p>\n<p>python<\/p>\n<p># src\/crawlers\/artsy.py<\/p>\n<p>import asyncio<br \/>\nimport json<br \/>\nfrom typing import List, Dict, Any, Optional<br \/>\nfrom playwright.async_api import async_playwright, Browser, Page<br \/>\nfrom src.crawlers.base import BaseCrawler<br \/>\nfrom src.config.settings import settings<\/p>\n<p>class ArtsyCrawler(BaseCrawler):<br \/>\n    &#034;&#034;&#034;<br \/>\n    Artsy\u7f51\u7ad9\u722c\u866b<br \/>\n    \u4f7f\u7528Playwright\u5904\u7406JavaScript\u6e32\u67d3\u7684\u9875\u9762<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        super().__init__(<br \/>\n            name&#061;&#034;artsy&#034;,<br \/>\n            config&#061;settings.TARGET_SITES[&#034;artsy&#034;]<br \/>\n        )<br \/>\n        self.base_url &#061; self.config[&#034;base_url&#034;]<br \/>\n        self.browser: Optional[Browser] &#061; None<\/p>\n<p>    async def start(self):<br \/>\n        &#034;&#034;&#034;\u542f\u52a8\u722c\u866b&#xff0c;\u521d\u59cb\u5316\u6d4f\u89c8\u5668&#034;&#034;&#034;<br \/>\n        await super().start()<\/p>\n<p>        # \u542f\u52a8Playwright<br \/>\n        self.playwright &#061; await async_playwright().start()<\/p>\n<p>        # \u542f\u52a8\u6d4f\u89c8\u5668<br \/>\n        self.browser &#061; await 
self.playwright.chromium.launch(<br \/>\n            headless&#061;True,  # \u65e0\u5934\u6a21\u5f0f<br \/>\n            args&#061;[<br \/>\n                &#039;--disable-blink-features&#061;AutomationControlled&#039;,<br \/>\n                &#039;--disable-dev-shm-usage&#039;,<br \/>\n                &#039;--no-sandbox&#039;,<br \/>\n                &#039;--disable-setuid-sandbox&#039;,<br \/>\n                &#039;--disable-web-security&#039;,<br \/>\n                &#039;--disable-features&#061;IsolateOrigins,site-per-process&#039;<br \/>\n            ]<br \/>\n        )<\/p>\n<p>    async def close(self):<br \/>\n        &#034;&#034;&#034;\u5173\u95ed\u722c\u866b&#xff0c;\u5173\u95ed\u6d4f\u89c8\u5668&#034;&#034;&#034;<br \/>\n        if self.browser:<br \/>\n            await self.browser.close()<br \/>\n        if hasattr(self, &#039;playwright&#039;):<br \/>\n            await self.playwright.stop()<br \/>\n        await super().close()<\/p>\n<p>    async def run(self, max_items: int &#061; 100):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u8fd0\u884c\u722c\u866b<\/p>\n<p>        Args:<br \/>\n            max_items: \u6700\u5927\u91c7\u96c6\u6570\u91cf<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.logger.info(&#034;Starting Artsy crawler&#034;)<\/p>\n<p>        # \u521b\u5efa\u65b0\u9875\u9762<br \/>\n        page &#061; await self.browser.new_page()<\/p>\n<p>        try:<br \/>\n            # \u8bbe\u7f6e\u9875\u9762\u89c6\u56fe<br \/>\n            await page.set_viewport_size({&#034;width&#034;: 1920, &#034;height&#034;: 1080})<\/p>\n<p>            # \u8bbe\u7f6e\u7528\u6237\u4ee3\u7406\u548c\u8bf7\u6c42\u5934<br \/>\n            await page.set_extra_http_headers({<br \/>\n                &#034;Accept-Language&#034;: &#034;en-US,en;q&#061;0.9&#034;,<br \/>\n                &#034;Accept-Encoding&#034;: &#034;gzip, deflate, br&#034;,<br \/>\n                &#034;Accept&#034;: 
&#034;text\/html,application\/xhtml&#043;xml,application\/xml;q&#061;0.9,image\/webp,*\/*;q&#061;0.8&#034;,<br \/>\n                &#034;Connection&#034;: &#034;keep-alive&#034;,<br \/>\n                &#034;Upgrade-Insecure-Requests&#034;: &#034;1&#034;<br \/>\n            })<\/p>\n<p>            # \u8bbf\u95ee\u827a\u672f\u54c1\u5217\u8868\u9875<br \/>\n            url &#061; f&#034;{self.base_url}\/collect&#034;<br \/>\n            self.logger.info(f&#034;Navigating to {url}&#034;)<\/p>\n<p>            await page.goto(url, wait_until&#061;&#034;networkidle&#034;)<\/p>\n<p>            # \u7b49\u5f85\u5185\u5bb9\u52a0\u8f7d<br \/>\n            await page.wait_for_selector(&#034;.artwork-item&#034;, timeout&#061;10000)<\/p>\n<p>            # \u6eda\u52a8\u52a0\u8f7d\u66f4\u591a\u5185\u5bb9<br \/>\n            artworks &#061; []<br \/>\n            previous_height &#061; 0<\/p>\n<p>            while len(artworks) &lt; max_items:<br \/>\n                # \u83b7\u53d6\u5f53\u524d\u9875\u9762\u7684\u827a\u672f\u54c1<br \/>\n                page_artworks &#061; await self._extract_artworks_from_page(page)<br \/>\n                artworks.extend(page_artworks)<\/p>\n<p>                self.logger.info(f&#034;Collected {len(artworks)} artworks so far&#034;)<\/p>\n<p>                # \u68c0\u67e5\u662f\u5426\u8fbe\u5230\u76ee\u6807<br \/>\n                if len(artworks) &gt;&#061; max_items:<br \/>\n                    break<\/p>\n<p>                # \u6eda\u52a8\u5230\u5e95\u90e8\u52a0\u8f7d\u66f4\u591a<br \/>\n                previous_height &#061; await page.evaluate(&#034;document.body.scrollHeight&#034;)<br \/>\n                await page.evaluate(&#034;window.scrollTo(0, document.body.scrollHeight)&#034;)<\/p>\n<p>                # \u7b49\u5f85\u65b0\u5185\u5bb9\u52a0\u8f7d<br \/>\n                await page.wait_for_timeout(2000)<\/p>\n<p>                # \u68c0\u67e5\u662f\u5426\u8fd8\u6709\u66f4\u591a\u5185\u5bb9<br \/>\n                new_height &#061; 
await page.evaluate(&#034;document.body.scrollHeight&#034;)<br \/>\n                if new_height &#061;&#061; previous_height:<br \/>\n                    self.logger.info(&#034;Reached end of page&#034;)<br \/>\n                    break<\/p>\n<p>            # \u83b7\u53d6\u8be6\u60c5\u9875\u4fe1\u606f<br \/>\n            detailed_artworks &#061; []<br \/>\n            for artwork in artworks[:max_items]:<br \/>\n                if artwork.get(&#034;href&#034;):<br \/>\n                    detail &#061; await self._get_artwork_detail(artwork[&#034;href&#034;])<br \/>\n                    if detail:<br \/>\n                        artwork.update(detail)<br \/>\n                        detailed_artworks.append(artwork)<\/p>\n<p>            self.logger.info(f&#034;Final collected: {len(detailed_artworks)} artworks&#034;)<\/p>\n<p>            # \u4fdd\u5b58\u6570\u636e<br \/>\n            await self.save_data(detailed_artworks)<\/p>\n<p>            return detailed_artworks<\/p>\n<p>        finally:<br \/>\n            await page.close()<\/p>\n<p>    async def _extract_artworks_from_page(self, page: Page) -&gt; List[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u4ece\u9875\u9762\u63d0\u53d6\u827a\u672f\u54c1\u4fe1\u606f<\/p>\n<p>        Args:<br \/>\n            page: Playwright\u9875\u9762\u5bf9\u8c61<\/p>\n<p>        Returns:<br \/>\n            \u827a\u672f\u54c1\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        artworks &#061; []<\/p>\n<p>        # \u4f7f\u7528JavaScript\u5728\u9875\u9762\u4e0a\u4e0b\u6587\u4e2d\u63d0\u53d6\u6570\u636e<br \/>\n        data &#061; await page.evaluate(&#034;&#034;&#034;<br \/>\n            () &#061;&gt; {<br \/>\n                const items &#061; document.querySelectorAll(&#039;.artwork-item&#039;);<br \/>\n                const results &#061; [];<\/p>\n<p>                items.forEach(item &#061;&gt; {<br \/>\n                    const link &#061; item.querySelector(&#039;a&#039;);<br \/>\n                
    const img &#061; item.querySelector(&#039;img&#039;);<br \/>\n                    const title &#061; item.querySelector(&#039;.artwork-title&#039;);<br \/>\n                    const artist &#061; item.querySelector(&#039;.artwork-artist&#039;);<\/p>\n<p>                    results.push({<br \/>\n                        href: link ? link.href : null,<br \/>\n                        image: img ? img.src : null,<br \/>\n                        title: title ? title.textContent.trim() : null,<br \/>\n                        artist: artist ? artist.textContent.trim() : null<br \/>\n                    });<br \/>\n                });<\/p>\n<p>                return results;<br \/>\n            }<br \/>\n        &#034;&#034;&#034;)<\/p>\n<p>        for item in data:<br \/>\n            if item.get(&#039;href&#039;):<br \/>\n                artwork &#061; {<br \/>\n                    &#039;url&#039;: item[&#039;href&#039;],<br \/>\n                    &#039;title&#039;: item.get(&#039;title&#039;, &#039;&#039;),<br \/>\n                    &#039;artist&#039;: item.get(&#039;artist&#039;, &#039;&#039;),<br \/>\n                    &#039;image_url&#039;: item.get(&#039;image&#039;, &#039;&#039;),<br \/>\n                    &#039;source&#039;: &#039;artsy&#039;<br \/>\n                }<br \/>\n                artworks.append(artwork)<\/p>\n<p>        return artworks<\/p>\n<p>    async def _get_artwork_detail(self, url: str) -&gt; Dict[str, Any]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u83b7\u53d6\u827a\u672f\u54c1\u8be6\u60c5<\/p>\n<p>        Args:<br \/>\n            url: \u8be6\u60c5\u9875URL<\/p>\n<p>        Returns:<br \/>\n            \u8be6\u60c5\u4fe1\u606f<br \/>\n        &#034;&#034;&#034;<br \/>\n        page &#061; await self.browser.new_page()<\/p>\n<p>        try:<br \/>\n            await page.goto(url, wait_until&#061;&#034;networkidle&#034;)<\/p>\n<p>            # \u7b49\u5f85\u4e3b\u8981\u5185\u5bb9\u52a0\u8f7d<br \/>\n            await 
page.wait_for_selector(&#034;.artwork-detail&#034;, timeout&#061;10000)<\/p>\n<p>            # \u63d0\u53d6\u8be6\u7ec6\u4fe1\u606f<br \/>\n            detail &#061; await page.evaluate(&#034;&#034;&#034;<br \/>\n                () &#061;&gt; {<br \/>\n                    const detail &#061; {};<\/p>\n<p>                    \/\/ \u6807\u9898<br \/>\n                    const title &#061; document.querySelector(&#039;.artwork-title&#039;);<br \/>\n                    if (title) detail.title &#061; title.textContent.trim();<\/p>\n<p>                    \/\/ \u827a\u672f\u5bb6<br \/>\n                    const artist &#061; document.querySelector(&#039;.artwork-artist&#039;);<br \/>\n                    if (artist) detail.artist &#061; artist.textContent.trim();<\/p>\n<p>                    \/\/ \u5e74\u4efd<br \/>\n                    const year &#061; document.querySelector(&#039;.artwork-year&#039;);<br \/>\n                    if (year) detail.year &#061; year.textContent.trim();<\/p>\n<p>                    \/\/ \u6750\u8d28<br \/>\n                    const medium &#061; document.querySelector(&#039;.artwork-medium&#039;);<br \/>\n                    if (medium) detail.medium &#061; medium.textContent.trim();<\/p>\n<p>                    \/\/ \u5c3a\u5bf8<br \/>\n                    const dimensions &#061; document.querySelector(&#039;.artwork-dimensions&#039;);<br \/>\n                    if (dimensions) detail.dimensions &#061; dimensions.textContent.trim();<\/p>\n<p>                    \/\/ \u63cf\u8ff0<br \/>\n                    const description &#061; document.querySelector(&#039;.artwork-description&#039;);<br \/>\n                    if (description) detail.description &#061; description.textContent.trim();<\/p>\n<p>                    \/\/ \u4ef7\u683c<br \/>\n                    const price &#061; document.querySelector(&#039;.artwork-price&#039;);<br \/>\n                    if (price) detail.price &#061; price.textContent.trim();<\/p>\n<p>           
         return detail;<br \/>\n                }<br \/>\n            &#034;&#034;&#034;)<\/p>\n<p>            return detail<\/p>\n<p>        except Exception as e:<br \/>\n            self.logger.error(f&#034;Error getting artwork detail from {url}: {e}&#034;)<br \/>\n            return {}<br \/>\n        finally:<br \/>\n            await page.close()<\/p>\n<p>    async def parse(self, response):<br \/>\n        &#034;&#034;&#034;\u5b9e\u73b0\u62bd\u8c61\u65b9\u6cd5&#xff0c;\u4f46Artsy\u4f7f\u7528Playwright&#xff0c;\u6b64\u65b9\u6cd5\u4e0d\u4f1a\u76f4\u63a5\u4f7f\u7528&#034;&#034;&#034;<br \/>\n        return [] <\/p>\n<h3>\u4e94\u3001\u6570\u636e\u5904\u7406\u7ba1\u9053<\/h3>\n<h4>5.1 \u6570\u636e\u9a8c\u8bc1\u5668<\/h4>\n<p>python<\/p>\n<p># src\/pipelines\/validation.py<\/p>\n<p>import re<br \/>\nfrom typing import Dict, Any, List, Optional<br \/>\nfrom datetime import datetime<br \/>\nfrom pydantic import BaseModel, Field, validator, ValidationError<br \/>\nfrom src.config.constants import REGEX_PATTERNS<\/p>\n<p>class ArtworkModel(BaseModel):<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u827a\u672f\u4f5c\u54c1\u6570\u636e\u6a21\u578b&#xff08;\u4f7f\u7528Pydantic v2&#xff09;<br \/>\n    &#034;&#034;&#034;<br \/>\n    # \u57fa\u672c\u4fe1\u606f<br \/>\n    title: str &#061; Field(..., min_length&#061;1, max_length&#061;500)<br \/>\n    artist: str &#061; Field(..., min_length&#061;1, max_length&#061;200)<br \/>\n    date: Optional[str] &#061; Field(None, max_length&#061;50)<br \/>\n    year: Optional[int] &#061; Field(None, ge&#061;1000, le&#061;2025)<br \/>\n    medium: Optional[str] &#061; Field(None, max_length&#061;200)<br \/>\n    dimensions: Optional[str] &#061; Field(None, max_length&#061;100)<\/p>\n<p>    # \u5206\u7c7b\u4fe1\u606f<br \/>\n    style: Optional[str] &#061; Field(None, max_length&#061;100)<br \/>\n    genre: Optional[str] &#061; Field(None, max_length&#061;100)<br \/>\n    series: Optional[str] &#061; Field(None, 
max_length&#061;200)<\/p>\n<p>    # \u6765\u6e90\u4fe1\u606f<br \/>\n    source: str &#061; Field(..., max_length&#061;50)<br \/>\n    url: Optional[str] &#061; Field(None, max_length&#061;500)<br \/>\n    image_url: Optional[str] &#061; Field(None, max_length&#061;500)<\/p>\n<p>    # \u63cf\u8ff0\u4fe1\u606f<br \/>\n    description: Optional[str] &#061; Field(None, max_length&#061;5000)<\/p>\n<p>    # \u5143\u6570\u636e<br \/>\n    created_at: datetime &#061; Field(default_factory&#061;datetime.now)<br \/>\n    updated_at: datetime &#061; Field(default_factory&#061;datetime.now)<\/p>\n<p>    &#064;validator(&#039;title&#039;)<br \/>\n    def validate_title(cls, v):<br \/>\n        &#034;&#034;&#034;\u9a8c\u8bc1\u6807\u9898&#034;&#034;&#034;<br \/>\n        if not v or not v.strip():<br \/>\n            raise ValueError(&#039;Title cannot be empty&#039;)<br \/>\n        return v.strip()<\/p>\n<p>    &#064;validator(&#039;artist&#039;)<br \/>\n    def validate_artist(cls, v):<br \/>\n        &#034;&#034;&#034;\u9a8c\u8bc1\u827a\u672f\u5bb6&#034;&#034;&#034;<br \/>\n        if not v or not v.strip():<br \/>\n            raise ValueError(&#039;Artist cannot be empty&#039;)<br \/>\n        return v.strip()<\/p>\n<p>    &#064;validator(&#039;year&#039;)<br \/>\n    def validate_year(cls, v):<br \/>\n        &#034;&#034;&#034;\u9a8c\u8bc1\u5e74\u4efd&#034;&#034;&#034;<br \/>\n        if v is not None:<br \/>\n            current_year &#061; datetime.now().year<br \/>\n            if v &lt; 1000 or v &gt; current_year &#043; 5:<br \/>\n                raise ValueError(f&#039;Year must be between 1000 and {current_year &#043; 5}&#039;)<br \/>\n        return v<\/p>\n<p>    &#064;validator(&#039;dimensions&#039;)<br \/>\n    def validate_dimensions(cls, v):<br \/>\n        &#034;&#034;&#034;\u9a8c\u8bc1\u5c3a\u5bf8\u683c\u5f0f&#034;&#034;&#034;<br \/>\n        if v:<br \/>\n            # \u68c0\u67e5\u5c3a\u5bf8\u683c\u5f0f&#xff1a;\u6570\u5b57 x \u6570\u5b57 
\u5355\u4f4d<br \/>\n            pattern &#061; r&#039;^\\d&#043;(\\.\\d&#043;)?\\s*[xX]\\s*\\d&#043;(\\.\\d&#043;)?\\s*(cm|mm|in|inches)?$&#039;<br \/>\n            if not re.match(pattern, v.strip()):<br \/>\n                # \u5982\u679c\u4e0d\u662f\u6807\u51c6\u683c\u5f0f&#xff0c;\u8bb0\u5f55\u4f46\u4e0d\u629b\u51fa\u5f02\u5e38<br \/>\n                pass<br \/>\n        return v<\/p>\n<p>    &#064;validator(&#039;image_url&#039;)<br \/>\n    def validate_image_url(cls, v):<br \/>\n        &#034;&#034;&#034;\u9a8c\u8bc1\u56fe\u7247URL&#034;&#034;&#034;<br \/>\n        if v:<br \/>\n            url_pattern &#061; r&#039;^https?:\/\/.&#043;\\.(jpg|jpeg|png|gif|webp)(\\?.*)?$&#039;<br \/>\n            if not re.match(url_pattern, v.lower()):<br \/>\n                # \u5982\u679c\u4e0d\u662f\u56fe\u7247URL&#xff0c;\u8bb0\u5f55\u4f46\u4e0d\u629b\u51fa\u5f02\u5e38<br \/>\n                pass<br \/>\n        return v<\/p>\n<p>class DataValidator:<br \/>\n    &#034;&#034;&#034;\u6570\u636e\u9a8c\u8bc1\u5668&#034;&#034;&#034;<\/p>\n<p>    &#064;staticmethod<br \/>\n    def validate_artwork(data: Dict[str, Any]) -&gt; bool:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u9a8c\u8bc1\u827a\u672f\u4f5c\u54c1\u6570\u636e<\/p>\n<p>        Args:<br \/>\n            data: \u5f85\u9a8c\u8bc1\u7684\u6570\u636e<\/p>\n<p>        Returns:<br \/>\n            \u662f\u5426\u6709\u6548<br \/>\n        &#034;&#034;&#034;<br \/>\n        try:<br \/>\n            # \u4f7f\u7528Pydantic\u6a21\u578b\u9a8c\u8bc1<br \/>\n            artwork &#061; ArtworkModel(**data)<br \/>\n            return True<br \/>\n        except ValidationError as e:<br \/>\n            print(f&#034;Validation error: {e}&#034;)<br \/>\n            return False<br \/>\n        except Exception as e:<br \/>\n            print(f&#034;Unexpected error: {e}&#034;)<br \/>\n            return False<\/p>\n<p>    &#064;staticmethod<br \/>\n    def validate_batch(artworks: List[Dict]) -&gt; 
tuple[List[Dict], List[Dict]]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6279\u91cf\u9a8c\u8bc1<\/p>\n<p>        Args:<br \/>\n            artworks: \u827a\u672f\u54c1\u6570\u636e\u5217\u8868<\/p>\n<p>        Returns:<br \/>\n            (\u6709\u6548\u6570\u636e, \u65e0\u6548\u6570\u636e)<br \/>\n        &#034;&#034;&#034;<br \/>\n        valid &#061; []<br \/>\n        invalid &#061; []<\/p>\n<p>        for artwork in artworks:<br \/>\n            if DataValidator.validate_artwork(artwork):<br \/>\n                valid.append(artwork)<br \/>\n            else:<br \/>\n                invalid.append(artwork)<\/p>\n<p>        return valid, invalid <\/p>\n<h4>5.2 \u6570\u636e\u53bb\u91cd\u5668<\/h4>\n<p>python<\/p>\n<p># src\/pipelines\/deduplication.py<\/p>\n<p>import hashlib<br \/>\nimport json<br \/>\nfrom typing import Dict, List, Set, Optional<br \/>\nfrom datetime import datetime, timedelta<br \/>\nfrom difflib import SequenceMatcher<br \/>\nimport redis.asyncio as redis<br \/>\nfrom src.config.settings import settings<\/p>\n<p>class DeduplicationPipeline:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u6570\u636e\u53bb\u91cd\u7ba1\u9053<br \/>\n    \u4f7f\u7528Redis\u5b58\u50a8\u6307\u7eb9&#xff0c;\u652f\u6301\u591a\u79cd\u53bb\u91cd\u7b56\u7565<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        self.redis_client &#061; None<br \/>\n        self.seen_keys &#061; set()<br \/>\n        self.fingerprint_cache &#061; {}<\/p>\n<p>    async def initialize(self):<br \/>\n        &#034;&#034;&#034;\u521d\u59cb\u5316Redis\u8fde\u63a5&#034;&#034;&#034;<br \/>\n        self.redis_client &#061; redis.Redis(<br \/>\n            host&#061;settings.REDIS_CONFIG[&#039;host&#039;],<br \/>\n            port&#061;settings.REDIS_CONFIG[&#039;port&#039;],<br \/>\n            db&#061;settings.REDIS_CONFIG[&#039;db&#039;],<br \/>\n            password&#061;settings.REDIS_CONFIG.get(&#039;password&#039;),<br \/>\n            
decode_responses&#061;True<br \/>\n        )<\/p>\n<p>    async def close(self):<br \/>\n        &#034;&#034;&#034;\u5173\u95edRedis\u8fde\u63a5&#034;&#034;&#034;<br \/>\n        if self.redis_client:<br \/>\n            await self.redis_client.close()<\/p>\n<p>    def generate_fingerprint(self, artwork: Dict, fields: List[str]) -&gt; str:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u751f\u6210\u6570\u636e\u6307\u7eb9<\/p>\n<p>        Args:<br \/>\n            artwork: \u827a\u672f\u54c1\u6570\u636e<br \/>\n            fields: \u7528\u4e8e\u751f\u6210\u6307\u7eb9\u7684\u5b57\u6bb5<\/p>\n<p>        Returns:<br \/>\n            \u6307\u7eb9\u5b57\u7b26\u4e32<br \/>\n        &#034;&#034;&#034;<br \/>\n        # \u63d0\u53d6\u6307\u5b9a\u5b57\u6bb5\u7684\u503c<br \/>\n        values &#061; []<br \/>\n        for field in fields:<br \/>\n            value &#061; artwork.get(field, &#039;&#039;)<br \/>\n            if value:<br \/>\n                # \u6e05\u7406\u548c\u89c4\u8303\u5316<br \/>\n                value &#061; str(value).lower().strip()<br \/>\n                values.append(value)<\/p>\n<p>        # \u7ec4\u5408\u5e76\u751f\u6210\u54c8\u5e0c<br \/>\n        combined &#061; &#039;|&#039;.join(values)<br \/>\n        return hashlib.sha256(combined.encode()).hexdigest()<\/p>\n<p>    async def is_duplicate(self, artwork: Dict) -&gt; bool:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u68c0\u67e5\u662f\u5426\u91cd\u590d<\/p>\n<p>        Args:<br \/>\n            artwork: \u827a\u672f\u54c1\u6570\u636e<\/p>\n<p>        Returns:<br \/>\n            \u662f\u5426\u91cd\u590d<br \/>\n        &#034;&#034;&#034;<br \/>\n        if not settings.DEDUPLICATION_ENABLED:<br \/>\n            return False<\/p>\n<p>        # \u4f7f\u7528\u914d\u7f6e\u7684\u5b57\u6bb5\u751f\u6210\u6307\u7eb9<br \/>\n        fingerprint &#061; self.generate_fingerprint(<br \/>\n            artwork,<br \/>\n            settings.DEDUPLICATION_FIELDS<br \/>\n        )<\/p>\n<p>        
# \u68c0\u67e5Redis\u4e2d\u662f\u5426\u5b58\u5728<br \/>\n        if self.redis_client:<br \/>\n            exists &#061; await self.redis_client.exists(f&#034;artwork:{fingerprint}&#034;)<br \/>\n            if exists:<br \/>\n                return True<\/p>\n<p>        # \u68c0\u67e5\u5185\u5b58\u7f13\u5b58<br \/>\n        if fingerprint in self.fingerprint_cache:<br \/>\n            cache_time &#061; self.fingerprint_cache[fingerprint]<br \/>\n            if datetime.now() - cache_time &lt; timedelta(hours&#061;24):<br \/>\n                return True<\/p>\n<p>        return False<\/p>\n<p>    async def mark_as_seen(self, artwork: Dict):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6807\u8bb0\u6570\u636e\u4e3a\u5df2\u5904\u7406<\/p>\n<p>        Args:<br \/>\n            artwork: \u827a\u672f\u54c1\u6570\u636e<br \/>\n        &#034;&#034;&#034;<br \/>\n        fingerprint &#061; self.generate_fingerprint(<br \/>\n            artwork,<br \/>\n            settings.DEDUPLICATION_FIELDS<br \/>\n        )<\/p>\n<p>        # \u5b58\u5165Redis<br \/>\n        if self.redis_client:<br \/>\n            await self.redis_client.setex(<br \/>\n                f&#034;artwork:{fingerprint}&#034;,<br \/>\n                86400 * 30,  # \u4fdd\u755930\u5929<br \/>\n                json.dumps(artwork, default&#061;str)<br \/>\n            )<\/p>\n<p>        # \u66f4\u65b0\u5185\u5b58\u7f13\u5b58<br \/>\n        self.fingerprint_cache[fingerprint] &#061; datetime.now()<\/p>\n<p>    def calculate_similarity(self, artwork1: Dict, artwork2: Dict) -&gt; float:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u8ba1\u7b97\u4e24\u4e2a\u827a\u672f\u54c1\u7684\u76f8\u4f3c\u5ea6<\/p>\n<p>        Args:<br \/>\n            artwork1: \u827a\u672f\u54c11<br \/>\n            artwork2: \u827a\u672f\u54c12<\/p>\n<p>        Returns:<br \/>\n            \u76f8\u4f3c\u5ea6\u5206\u6570&#xff08;0-1&#xff09;<br \/>\n        &#034;&#034;&#034;<br \/>\n        scores &#061; []<br \/>\n     
   weights &#061; {<br \/>\n            &#039;title&#039;: 0.3,<br \/>\n            &#039;artist&#039;: 0.3,<br \/>\n            &#039;date&#039;: 0.2,<br \/>\n            &#039;medium&#039;: 0.1,<br \/>\n            &#039;dimensions&#039;: 0.1<br \/>\n        }<\/p>\n<p>        for field, weight in weights.items():<br \/>\n            value1 &#061; str(artwork1.get(field, &#039;&#039;)).lower()<br \/>\n            value2 &#061; str(artwork2.get(field, &#039;&#039;)).lower()<\/p>\n<p>            if value1 and value2:<br \/>\n                similarity &#061; SequenceMatcher(None, value1, value2).ratio()<br \/>\n                scores.append(similarity * weight)<\/p>\n<p>        return sum(scores) if scores else 0<\/p>\n<p>    async def find_similar(self, artwork: Dict, threshold: float &#061; None) -&gt; List[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u67e5\u627e\u76f8\u4f3c\u7684\u827a\u672f\u54c1<\/p>\n<p>        Args:<br \/>\n            artwork: \u76ee\u6807\u827a\u672f\u54c1<br \/>\n            threshold: \u76f8\u4f3c\u5ea6\u9608\u503c<\/p>\n<p>        Returns:<br \/>\n            \u76f8\u4f3c\u827a\u672f\u54c1\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        if threshold is None:<br \/>\n            threshold &#061; settings.SIMILARITY_THRESHOLD<\/p>\n<p>        similar &#061; []<\/p>\n<p>        # \u8fd9\u91cc\u53ef\u4ee5\u4ece\u6570\u636e\u5e93\u67e5\u8be2\u76f8\u4f3c\u6570\u636e<br \/>\n        # \u793a\u4f8b\u4ee3\u7801\u7701\u7565\u5b9e\u9645\u6570\u636e\u5e93\u67e5\u8be2<\/p>\n<p>        return similar<\/p>\n<p>    async def process_batch(self, artworks: List[Dict]) -&gt; tuple[List[Dict], List[Dict]]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6279\u91cf\u5904\u7406\u53bb\u91cd<\/p>\n<p>        Args:<br \/>\n            artworks: \u827a\u672f\u54c1\u5217\u8868<\/p>\n<p>        Returns:<br \/>\n            (\u65b0\u6570\u636e, \u91cd\u590d\u6570\u636e)<br \/>\n        &#034;&#034;&#034;<br \/>\n        
new_artworks &#061; []<br \/>\n        duplicate_artworks &#061; []<\/p>\n<p>        for artwork in artworks:<br \/>\n            if await self.is_duplicate(artwork):<br \/>\n                duplicate_artworks.append(artwork)<br \/>\n            else:<br \/>\n                await self.mark_as_seen(artwork)<br \/>\n                new_artworks.append(artwork)<\/p>\n<p>        self.logger.info(<br \/>\n            f&#034;Deduplication: {len(new_artworks)} new, &#034;<br \/>\n            f&#034;{len(duplicate_artworks)} duplicates&#034;<br \/>\n        )<\/p>\n<p>        return new_artworks, duplicate_artworks <\/p>\n<h4>5.3 \u6570\u636e\u589e\u5f3a\u5668<\/h4>\n<p>python<\/p>\n<p># src\/pipelines\/enrichment.py<\/p>\n<p>import re<br \/>\nfrom typing import Dict, Any, Optional<br \/>\nfrom datetime import datetime<br \/>\nimport spacy<br \/>\nfrom PIL import Image<br \/>\nimport requests<br \/>\nfrom io import BytesIO<br \/>\nimport exifread<br \/>\nfrom transformers import pipeline<br \/>\nfrom src.config.constants import ART_MOVEMENTS<\/p>\n<p>class DataEnrichmentPipeline:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u6570\u636e\u589e\u5f3a\u7ba1\u9053<br \/>\n    \u4f7f\u7528NLP\u548c\u56fe\u50cf\u5904\u7406\u6280\u672f\u4e30\u5bcc\u6570\u636e<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        # \u52a0\u8f7dNLP\u6a21\u578b<br \/>\n        try:<br \/>\n            self.nlp &#061; spacy.load(&#034;en_core_web_sm&#034;)<br \/>\n        except:<br \/>\n            # \u5982\u679c\u6a21\u578b\u4e0d\u5b58\u5728&#xff0c;\u4e0b\u8f7d<br \/>\n            import subprocess<br \/>\n            subprocess.run([&#034;python&#034;, &#034;-m&#034;, &#034;spacy&#034;, &#034;download&#034;, &#034;en_core_web_sm&#034;])<br \/>\n            self.nlp &#061; spacy.load(&#034;en_core_web_sm&#034;)<\/p>\n<p>        # \u521d\u59cb\u5316\u56fe\u50cf\u63cf\u8ff0\u6a21\u578b<br \/>\n        self.image_captioner &#061; pipeline(<br \/>\n            
&#034;image-to-text&#034;,<br \/>\n            model&#061;&#034;nlpconnect\/vit-gpt2-image-captioning&#034;<br \/>\n        )<\/p>\n<p>        # \u521d\u59cb\u5316\u5b9e\u4f53\u8bc6\u522b\u6a21\u578b<br \/>\n        self.ner &#061; pipeline(<br \/>\n            &#034;ner&#034;,<br \/>\n            model&#061;&#034;dbmdz\/bert-large-cased-finetuned-conll03-english&#034;<br \/>\n        )<\/p>\n<p>    def enrich_artwork(self, artwork: Dict) -&gt; Dict:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u589e\u5f3a\u827a\u672f\u54c1\u6570\u636e<\/p>\n<p>        Args:<br \/>\n            artwork: \u539f\u59cb\u827a\u672f\u54c1\u6570\u636e<\/p>\n<p>        Returns:<br \/>\n            \u589e\u5f3a\u540e\u7684\u6570\u636e<br \/>\n        &#034;&#034;&#034;<br \/>\n        enriched &#061; artwork.copy()<\/p>\n<p>        # 1. \u4ece\u63cf\u8ff0\u4e2d\u63d0\u53d6\u5b9e\u4f53<br \/>\n        if artwork.get(&#039;description&#039;):<br \/>\n            entities &#061; self._extract_entities(artwork[&#039;description&#039;])<br \/>\n            enriched[&#039;extracted_entities&#039;] &#061; entities<\/p>\n<p>        # 2. \u89e3\u6790\u548c\u89c4\u8303\u5316\u65e5\u671f<br \/>\n        if artwork.get(&#039;date&#039;):<br \/>\n            normalized_date &#061; self._normalize_date(artwork[&#039;date&#039;])<br \/>\n            if normalized_date:<br \/>\n                enriched[&#039;normalized_date&#039;] &#061; normalized_date<\/p>\n<p>        # 3. \u5206\u7c7b\u827a\u672f\u6d41\u6d3e<br \/>\n        if artwork.get(&#039;style&#039;):<br \/>\n            movement &#061; self._classify_movement(artwork[&#039;style&#039;])<br \/>\n            if movement:<br \/>\n                enriched[&#039;art_movement&#039;] &#061; movement<\/p>\n<p>        # 4. 
\u4ece\u56fe\u50cf\u4e2d\u63d0\u53d6\u4fe1\u606f<br \/>\n        if artwork.get(&#039;image_url&#039;):<br \/>\n            image_info &#061; self._extract_image_info(artwork[&#039;image_url&#039;])<br \/>\n            if image_info:<br \/>\n                enriched[&#039;image_metadata&#039;] &#061; image_info<\/p>\n<p>        # 5. \u751f\u6210\u6807\u7b7e<br \/>\n        tags &#061; self._generate_tags(artwork)<br \/>\n        if tags:<br \/>\n            enriched[&#039;tags&#039;] &#061; tags<\/p>\n<p>        # 6. \u6dfb\u52a0\u65f6\u95f4\u6233<br \/>\n        enriched[&#039;enriched_at&#039;] &#061; datetime.now().isoformat()<\/p>\n<p>        return enriched<\/p>\n<p>    def _extract_entities(self, text: str) -&gt; Dict[str, list]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u4ece\u6587\u672c\u4e2d\u63d0\u53d6\u5b9e\u4f53<\/p>\n<p>        Args:<br \/>\n            text: \u6587\u672c\u5185\u5bb9<\/p>\n<p>        Returns:<br \/>\n            \u5b9e\u4f53\u5b57\u5178<br \/>\n        &#034;&#034;&#034;<br \/>\n        doc &#061; self.nlp(text)<\/p>\n<p>        entities &#061; {<br \/>\n            &#039;persons&#039;: [],<br \/>\n            &#039;organizations&#039;: [],<br \/>\n            &#039;locations&#039;: [],<br \/>\n            &#039;dates&#039;: [],<br \/>\n            &#039;artworks&#039;: []<br \/>\n        }<\/p>\n<p>        for ent in doc.ents:<br \/>\n            if ent.label_ &#061;&#061; &#039;PERSON&#039;:<br \/>\n                entities[&#039;persons&#039;].append(ent.text)<br \/>\n            elif ent.label_ &#061;&#061; &#039;ORG&#039;:<br \/>\n                entities[&#039;organizations&#039;].append(ent.text)<br \/>\n            elif ent.label_ &#061;&#061; &#039;GPE&#039; or ent.label_ &#061;&#061; &#039;LOC&#039;:<br \/>\n                entities[&#039;locations&#039;].append(ent.text)<br \/>\n            elif ent.label_ &#061;&#061; &#039;DATE&#039;:<br \/>\n                entities[&#039;dates&#039;].append(ent.text)<br \/>\n   
         elif ent.label_ &#061;&#061; &#039;WORK_OF_ART&#039;:<br \/>\n                entities[&#039;artworks&#039;].append(ent.text)<\/p>\n<p>        return entities<\/p>\n<p>    def _normalize_date(self, date_str: str) -&gt; Optional[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u89c4\u8303\u5316\u65e5\u671f\u683c\u5f0f<\/p>\n<p>        Args:<br \/>\n            date_str: \u539f\u59cb\u65e5\u671f\u5b57\u7b26\u4e32<\/p>\n<p>        Returns:<br \/>\n            \u89c4\u8303\u5316\u540e\u7684\u65e5\u671f\u4fe1\u606f<br \/>\n        &#034;&#034;&#034;<br \/>\n        date_info &#061; {}<\/p>\n<p>        # \u5339\u914d\u5e74\u4efd<br \/>\n        year_pattern &#061; r&#039;\\b(1[0-9]{3}|2[0-9]{3})\\b&#039;<br \/>\n        years &#061; re.findall(year_pattern, date_str)<\/p>\n<p>        if years:<br \/>\n            date_info[&#039;years&#039;] &#061; [int(y) for y in years]<\/p>\n<p>        # \u5339\u914d\u4e16\u7eaa<br \/>\n        century_pattern &#061; r&#039;(\\d&#043;)(st|nd|rd|th)\\s&#043;century&#039;<br \/>\n        century_match &#061; re.search(century_pattern, date_str.lower())<br \/>\n        if century_match:<br \/>\n            century &#061; int(century_match.group(1))<br \/>\n            date_info[&#039;century&#039;] &#061; century<br \/>\n            date_info[&#039;century_range&#039;] &#061; [<br \/>\n                (century - 1) * 100,<br \/>\n                century * 100 - 1<br \/>\n            ]<\/p>\n<p>        # \u5339\u914d\u8303\u56f4<br \/>\n        range_pattern &#061; r&#039;(\\d{4})\\s*[-\u2013\u2014]\\s*(\\d{4})&#039;<br \/>\n        range_match &#061; re.search(range_pattern, date_str)<br \/>\n        if range_match:<br \/>\n            date_info[&#039;date_range&#039;] &#061; [<br \/>\n                int(range_match.group(1)),<br \/>\n                int(range_match.group(2))<br \/>\n            ]<\/p>\n<p>        # \u5c1d\u8bd5\u89e3\u6790\u5177\u4f53\u65e5\u671f<br \/>\n        try:<br 
\/>\n            # \u5c1d\u8bd5\u591a\u79cd\u65e5\u671f\u683c\u5f0f<br \/>\n            for fmt in [&#039;%Y&#039;, &#039;%Y-%m-%d&#039;, &#039;%d %B %Y&#039;, &#039;%B %d, %Y&#039;]:<br \/>\n                try:<br \/>\n                    parsed_date &#061; datetime.strptime(date_str, fmt)<br \/>\n                    date_info[&#039;iso_date&#039;] &#061; parsed_date.isoformat()<br \/>\n                    break<br \/>\n                except:<br \/>\n                    continue<br \/>\n        except:<br \/>\n            pass<\/p>\n<p>        return date_info if date_info else None<\/p>\n<p>    def _classify_movement(self, style: str) -&gt; Optional[str]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u5206\u7c7b\u827a\u672f\u6d41\u6d3e<\/p>\n<p>        Args:<br \/>\n            style: \u827a\u672f\u98ce\u683c<\/p>\n<p>        Returns:<br \/>\n            \u89c4\u8303\u5316\u540e\u7684\u6d41\u6d3e<br \/>\n        &#034;&#034;&#034;<br \/>\n        style_lower &#061; style.lower()<\/p>\n<p>        # \u76f4\u63a5\u5339\u914d<br \/>\n        for eng, chn in ART_MOVEMENTS.items():<br \/>\n            if eng in style_lower or chn in style_lower:<br \/>\n                return chn<\/p>\n<p>        # \u6a21\u7cca\u5339\u914d<br \/>\n        for eng, chn in ART_MOVEMENTS.items():<br \/>\n            if any(word in style_lower for word in eng.split(&#039;-&#039;)):<br \/>\n                return chn<\/p>\n<p>        return None<\/p>\n<p>    def _extract_image_info(self, image_url: str) -&gt; Optional[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u4ece\u56fe\u50cf\u4e2d\u63d0\u53d6\u4fe1\u606f<\/p>\n<p>        Args:<br \/>\n            image_url: \u56fe\u50cfURL<\/p>\n<p>        Returns:<br \/>\n            \u56fe\u50cf\u4fe1\u606f<br \/>\n        &#034;&#034;&#034;<br \/>\n        try:<br \/>\n            # \u4e0b\u8f7d\u56fe\u50cf<br \/>\n            response &#061; requests.get(image_url, timeout&#061;10)<br \/>\n            img &#061; 
Image.open(BytesIO(response.content))<\/p>\n<p>            image_info &#061; {<br \/>\n                &#039;format&#039;: img.format,<br \/>\n                &#039;mode&#039;: img.mode,<br \/>\n                &#039;size&#039;: img.size,<br \/>\n                &#039;width&#039;: img.width,<br \/>\n                &#039;height&#039;: img.height,<br \/>\n                &#039;aspect_ratio&#039;: round(img.width \/ img.height, 2)<br \/>\n            }<\/p>\n<p>            # \u63d0\u53d6EXIF\u6570\u636e<br \/>\n            if hasattr(img, &#039;_getexif&#039;) and img._getexif():<br \/>\n                exif_data &#061; {}<br \/>\n                tags &#061; exifread.process_file(BytesIO(response.content))<br \/>\n                for tag, value in tags.items():<br \/>\n                    exif_data[tag] &#061; str(value)<br \/>\n                image_info[&#039;exif&#039;] &#061; exif_data<\/p>\n<p>            # \u751f\u6210\u56fe\u50cf\u63cf\u8ff0<br \/>\n            if img.size[0] * img.size[1] &lt; 1000000:  # \u9650\u5236\u5927\u5c0f\u907f\u514d\u5185\u5b58\u95ee\u9898<br \/>\n                try:<br \/>\n                    caption &#061; self.image_captioner(img)<br \/>\n                    if caption:<br \/>\n                        image_info[&#039;ai_description&#039;] &#061; caption[0][&#039;generated_text&#039;]<br \/>\n                except:<br \/>\n                    pass<\/p>\n<p>            return image_info<\/p>\n<p>        except Exception as e:<br \/>\n            print(f&#034;Error extracting image info: {e}&#034;)<br \/>\n            return None<\/p>\n<p>    def _generate_tags(self, artwork: Dict) -&gt; List[str]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u751f\u6210\u6807\u7b7e<\/p>\n<p>        Args:<br \/>\n            artwork: \u827a\u672f\u54c1\u6570\u636e<\/p>\n<p>        Returns:<br \/>\n            \u6807\u7b7e\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        tags &#061; set()<\/p>\n<p>        # 
\u57fa\u4e8e\u6807\u9898\u751f\u6210\u6807\u7b7e<br \/>\n        if artwork.get(&#039;title&#039;):<br \/>\n            title_words &#061; re.findall(r&#039;\\w&#043;&#039;, artwork[&#039;title&#039;].lower())<br \/>\n            tags.update(title_words[:5])  # \u53d6\u524d5\u4e2a\u8bcd<\/p>\n<p>        # \u57fa\u4e8e\u827a\u672f\u5bb6<br \/>\n        if artwork.get(&#039;artist&#039;):<br \/>\n            artist_parts &#061; artwork[&#039;artist&#039;].lower().split()<br \/>\n            tags.update(artist_parts[:3])<\/p>\n<p>        # \u57fa\u4e8e\u98ce\u683c<br \/>\n        if artwork.get(&#039;style&#039;):<br \/>\n            style_words &#061; artwork[&#039;style&#039;].lower().split()<br \/>\n            tags.update(style_words[:3])<\/p>\n<p>        # \u57fa\u4e8e\u6d41\u6d3e<br \/>\n        if artwork.get(&#039;genre&#039;):<br \/>\n            tags.add(artwork[&#039;genre&#039;].lower())<\/p>\n<p>        # \u8fc7\u6ee4\u6389\u592a\u77ed\u7684\u8bcd<br \/>\n        tags &#061; {tag for tag in tags if len(tag) &gt; 2}<\/p>\n<p>        return list(tags)[:10]  # \u6700\u591a\u8fd4\u56de10\u4e2a\u6807\u7b7e <\/p>\n<h4>5.4 \u6570\u636e\u5b58\u50a8\u5668<\/h4>\n<p>python<\/p>\n<p># src\/pipelines\/storage.py<\/p>\n<p>import asyncio<br \/>\nimport json<br \/>\nfrom typing import Dict, List, Any, Optional<br \/>\nfrom datetime import datetime<br \/>\nfrom motor.motor_asyncio import AsyncIOMotorClient<br \/>\nfrom elasticsearch import AsyncElasticsearch<br \/>\nimport asyncpg<br \/>\nfrom minio import Minio<br \/>\nimport io<\/p>\n<p>from src.config.settings import settings<\/p>\n<p>class StoragePipeline:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u6570\u636e\u5b58\u50a8\u7ba1\u9053<br \/>\n    \u652f\u6301\u591a\u6570\u636e\u5e93\u5b58\u50a8<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        self.mongo_client &#061; None<br \/>\n        self.mongo_db &#061; None<br \/>\n        self.pg_pool &#061; None<br \/>\n        self.es_client 
&#061; None<br \/>\n        self.minio_client &#061; None<\/p>\n<p>    async def initialize(self):<br \/>\n        &#034;&#034;&#034;\u521d\u59cb\u5316\u6240\u6709\u6570\u636e\u5e93\u8fde\u63a5&#034;&#034;&#034;<br \/>\n        # MongoDB\u8fde\u63a5<br \/>\n        self.mongo_client &#061; AsyncIOMotorClient(settings.MONGODB_URI)<br \/>\n        self.mongo_db &#061; self.mongo_client[settings.MONGODB_DB]<\/p>\n<p>        # PostgreSQL\u8fde\u63a5\u6c60<br \/>\n        self.pg_pool &#061; await asyncpg.create_pool(**settings.POSTGRESQL_CONFIG)<\/p>\n<p>        # Elasticsearch\u8fde\u63a5<br \/>\n        self.es_client &#061; AsyncElasticsearch(hosts&#061;settings.ELASTICSEARCH_CONFIG[&#039;hosts&#039;])<\/p>\n<p>        # MinIO\u8fde\u63a5<br \/>\n        self.minio_client &#061; Minio(<br \/>\n            &#039;localhost:9000&#039;,<br \/>\n            access_key&#061;&#039;minioadmin&#039;,<br \/>\n            secret_key&#061;&#039;minioadmin123&#039;,<br \/>\n            secure&#061;False<br \/>\n        )<\/p>\n<p>        # \u786e\u4fddbucket\u5b58\u5728<br \/>\n        if not self.minio_client.bucket_exists(&#039;art-images&#039;):<br \/>\n            self.minio_client.make_bucket(&#039;art-images&#039;)<\/p>\n<p>        # \u521b\u5efa\u6570\u636e\u5e93\u8868<br \/>\n        await self._init_postgres_tables()<\/p>\n<p>        # \u521b\u5efaElasticsearch\u7d22\u5f15<br \/>\n        await self._init_elasticsearch_index()<\/p>\n<p>    async def close(self):<br \/>\n        &#034;&#034;&#034;\u5173\u95ed\u6240\u6709\u6570\u636e\u5e93\u8fde\u63a5&#034;&#034;&#034;<br \/>\n        if self.mongo_client:<br \/>\n            self.mongo_client.close()<\/p>\n<p>        if self.pg_pool:<br \/>\n            await self.pg_pool.close()<\/p>\n<p>        if self.es_client:<br \/>\n            await self.es_client.close()<\/p>\n<p>    async def _init_postgres_tables(self):<br \/>\n        &#034;&#034;&#034;\u521d\u59cb\u5316PostgreSQL\u8868&#034;&#034;&#034;<br \/>\n        async 
with self.pg_pool.acquire() as conn:<br \/>\n            # \u521b\u5efa\u827a\u672f\u5bb6\u8868<br \/>\n            await conn.execute(&#039;&#039;&#039;<br \/>\n                CREATE TABLE IF NOT EXISTS artists (<br \/>\n                    id SERIAL PRIMARY KEY,<br \/>\n                    name VARCHAR(200) NOT NULL,<br \/>\n                    birth_year INTEGER,<br \/>\n                    death_year INTEGER,<br \/>\n                    nationality VARCHAR(100),<br \/>\n                    biography TEXT,<br \/>\n                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,<br \/>\n                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,<br \/>\n                    UNIQUE(name)<br \/>\n                )<br \/>\n            &#039;&#039;&#039;)<\/p>\n<p>            # \u521b\u5efa\u827a\u672f\u54c1\u8868<br \/>\n            await conn.execute(&#039;&#039;&#039;<br \/>\n                CREATE TABLE IF NOT EXISTS artworks (<br \/>\n                    id SERIAL PRIMARY KEY,<br \/>\n                    title VARCHAR(500) NOT NULL,<br \/>\n                    artist_id INTEGER REFERENCES artists(id),<br \/>\n                    year INTEGER,<br \/>\n                    medium VARCHAR(200),<br \/>\n                    dimensions VARCHAR(100),<br \/>\n                    style VARCHAR(100),<br \/>\n                    genre VARCHAR(100),<br \/>\n                    description TEXT,<br \/>\n                    image_url TEXT,<br \/>\n                    source VARCHAR(50),<br \/>\n                    source_url TEXT,<br \/>\n                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,<br \/>\n                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP<br \/>\n                )<br \/>\n            &#039;&#039;&#039;)<\/p>\n<p>            # \u521b\u5efa\u7d22\u5f15<br \/>\n            await conn.execute(&#039;CREATE INDEX IF NOT EXISTS idx_artworks_artist ON artworks(artist_id)&#039;)<br \/>\n            await conn.execute(&#039;CREATE 
INDEX IF NOT EXISTS idx_artworks_year ON artworks(year)&#039;)<br \/>\n            await conn.execute(&#039;CREATE INDEX IF NOT EXISTS idx_artworks_style ON artworks(style)&#039;)<\/p>\n<p>    async def _init_elasticsearch_index(self):<br \/>\n        &#034;&#034;&#034;\u521d\u59cb\u5316Elasticsearch\u7d22\u5f15&#034;&#034;&#034;<br \/>\n        index_name &#061; settings.ELASTICSEARCH_CONFIG[&#039;index&#039;]<\/p>\n<p>        # \u68c0\u67e5\u7d22\u5f15\u662f\u5426\u5b58\u5728<br \/>\n        exists &#061; await self.es_client.indices.exists(index&#061;index_name)<\/p>\n<p>        if not exists:<br \/>\n            # \u521b\u5efa\u7d22\u5f15\u6620\u5c04<br \/>\n            mappings &#061; {<br \/>\n                &#034;mappings&#034;: {<br \/>\n                    &#034;properties&#034;: {<br \/>\n                        &#034;title&#034;: {&#034;type&#034;: &#034;text&#034;, &#034;analyzer&#034;: &#034;standard&#034;},<br \/>\n                        &#034;artist&#034;: {&#034;type&#034;: &#034;text&#034;, &#034;fields&#034;: {&#034;keyword&#034;: {&#034;type&#034;: &#034;keyword&#034;}}},<br \/>\n                        &#034;description&#034;: {&#034;type&#034;: &#034;text&#034;, &#034;analyzer&#034;: &#034;standard&#034;},<br \/>\n                        &#034;year&#034;: {&#034;type&#034;: &#034;integer&#034;},<br \/>\n                        &#034;style&#034;: {&#034;type&#034;: &#034;keyword&#034;},<br \/>\n                        &#034;genre&#034;: {&#034;type&#034;: &#034;keyword&#034;},<br \/>\n                        &#034;medium&#034;: {&#034;type&#034;: &#034;keyword&#034;},<br \/>\n                        &#034;tags&#034;: {&#034;type&#034;: &#034;keyword&#034;},<br \/>\n                        &#034;created_at&#034;: {&#034;type&#034;: &#034;date&#034;}<br \/>\n                    }<br \/>\n                },<br \/>\n                &#034;settings&#034;: {<br \/>\n                    &#034;number_of_shards&#034;: 2,<br \/>\n                    
&#034;number_of_replicas&#034;: 1,<br \/>\n                    &#034;analysis&#034;: {<br \/>\n                        &#034;analyzer&#034;: {<br \/>\n                            &#034;art_analyzer&#034;: {<br \/>\n                                &#034;type&#034;: &#034;custom&#034;,<br \/>\n                                &#034;tokenizer&#034;: &#034;standard&#034;,<br \/>\n                                &#034;filter&#034;: [&#034;lowercase&#034;, &#034;stop&#034;, &#034;snowball&#034;]<br \/>\n                            }<br \/>\n                        }<br \/>\n                    }<br \/>\n                }<br \/>\n            }<\/p>\n<p>            await self.es_client.indices.create(<br \/>\n                index&#061;index_name,<br \/>\n                body&#061;mappings<br \/>\n            )<\/p>\n<p>    async def store_artwork(self, artwork: Dict) -&gt; str:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u5b58\u50a8\u5355\u4e2a\u827a\u672f\u54c1<\/p>\n<p>        Args:<br \/>\n            artwork: \u827a\u672f\u54c1\u6570\u636e<\/p>\n<p>        Returns:<br \/>\n            \u5b58\u50a8ID<br \/>\n        &#034;&#034;&#034;<br \/>\n        artwork_id &#061; None<\/p>\n<p>        # 1. \u5b58\u50a8\u5230MongoDB<br \/>\n        mongo_result &#061; await self.mongo_db.artworks.insert_one(artwork)<br \/>\n        artwork_id &#061; str(mongo_result.inserted_id)<\/p>\n<p>        # 2. \u5b58\u50a8\u5230PostgreSQL<br \/>\n        await self._store_to_postgres(artwork)<\/p>\n<p>        # 3. \u7d22\u5f15\u5230Elasticsearch<br \/>\n        await self._index_to_elasticsearch(artwork, artwork_id)<\/p>\n<p>        # 4. 
\u5b58\u50a8\u56fe\u7247<br \/>\n        if artwork.get(&#039;image_url&#039;):<br \/>\n            await self._store_image(artwork[&#039;image_url&#039;], artwork_id)<\/p>\n<p>        return artwork_id<\/p>\n<p>    async def store_batch(self, artworks: List[Dict]) -&gt; List[str]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6279\u91cf\u5b58\u50a8\u827a\u672f\u54c1<\/p>\n<p>        Args:<br \/>\n            artworks: \u827a\u672f\u54c1\u5217\u8868<\/p>\n<p>        Returns:<br \/>\n            \u5b58\u50a8ID\u5217\u8868<br \/>\n        &#034;&#034;&#034;<br \/>\n        if not artworks:<br \/>\n            return []<\/p>\n<p>        # \u6279\u91cf\u63d2\u5165MongoDB<br \/>\n        result &#061; await self.mongo_db.artworks.insert_many(artworks)<br \/>\n        ids &#061; [str(id) for id in result.inserted_ids]<\/p>\n<p>        # \u6279\u91cf\u63d2\u5165PostgreSQL<br \/>\n        await self._store_batch_to_postgres(artworks)<\/p>\n<p>        # \u6279\u91cf\u7d22\u5f15Elasticsearch<br \/>\n        await self._index_batch_to_elasticsearch(artworks, ids)<\/p>\n<p>        return ids<\/p>\n<p>    async def _store_to_postgres(self, artwork: Dict):<br \/>\n        &#034;&#034;&#034;\u5b58\u50a8\u5230PostgreSQL&#034;&#034;&#034;<br \/>\n        async with self.pg_pool.acquire() as conn:<br \/>\n            async with conn.transaction():<br \/>\n                # \u5148\u5904\u7406\u827a\u672f\u5bb6<br \/>\n                artist_id &#061; await self._get_or_create_artist(conn, artwork.get(&#039;artist&#039;))<\/p>\n<p>                # \u63d2\u5165\u827a\u672f\u54c1<br \/>\n                await conn.execute(&#039;&#039;&#039;<br \/>\n                    INSERT INTO artworks<br \/>\n                    (title, artist_id, year, medium, dimensions, style, genre,<br \/>\n                     description, image_url, source, source_url)<br \/>\n                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)<br \/>\n                    ON CONFLICT (title, 
artist_id) DO NOTHING<br \/>\n                &#039;&#039;&#039;,<br \/>\n                    artwork.get(&#039;title&#039;),<br \/>\n                    artist_id,<br \/>\n                    artwork.get(&#039;year&#039;),<br \/>\n                    artwork.get(&#039;medium&#039;),<br \/>\n                    artwork.get(&#039;dimensions&#039;),<br \/>\n                    artwork.get(&#039;style&#039;),<br \/>\n                    artwork.get(&#039;genre&#039;),<br \/>\n                    artwork.get(&#039;description&#039;),<br \/>\n                    artwork.get(&#039;image_url&#039;),<br \/>\n                    artwork.get(&#039;source&#039;),<br \/>\n                    artwork.get(&#039;url&#039;)<br \/>\n                )<\/p>\n<p>    async def _get_or_create_artist(self, conn, artist_name: str) -&gt; Optional[int]:<br \/>\n        &#034;&#034;&#034;\u83b7\u53d6\u6216\u521b\u5efa\u827a\u672f\u5bb6&#034;&#034;&#034;<br \/>\n        if not artist_name:<br \/>\n            return None<\/p>\n<p>        # \u67e5\u627e\u73b0\u6709\u827a\u672f\u5bb6<br \/>\n        result &#061; await conn.fetchrow(<br \/>\n            &#039;SELECT id FROM artists WHERE name &#061; $1&#039;,<br \/>\n            artist_name<br \/>\n        )<\/p>\n<p>        if result:<br \/>\n            return result[&#039;id&#039;]<\/p>\n<p>        # \u521b\u5efa\u65b0\u827a\u672f\u5bb6<br \/>\n        result &#061; await conn.fetchrow(<br \/>\n            &#039;INSERT INTO artists (name) VALUES ($1) RETURNING id&#039;,<br \/>\n            artist_name<br \/>\n        )<\/p>\n<p>        return result[&#039;id&#039;]<\/p>\n<p>    async def _store_batch_to_postgres(self, artworks: List[Dict]):<br \/>\n        &#034;&#034;&#034;\u6279\u91cf\u5b58\u50a8\u5230PostgreSQL&#034;&#034;&#034;<br \/>\n        async with self.pg_pool.acquire() as conn:<br \/>\n            async with conn.transaction():<br \/>\n                for artwork in artworks:<br \/>\n                    artist_id &#061; await 
self._get_or_create_artist(conn, artwork.get(&#039;artist&#039;))<\/p>\n<p>                    await conn.execute(&#039;&#039;&#039;<br \/>\n                        INSERT INTO artworks<br \/>\n                        (title, artist_id, year, medium, dimensions, style, genre,<br \/>\n                         description, image_url, source, source_url)<br \/>\n                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)<br \/>\n                        ON CONFLICT (title, artist_id) DO NOTHING<br \/>\n                    &#039;&#039;&#039;,<br \/>\n                        artwork.get(&#039;title&#039;),<br \/>\n                        artist_id,<br \/>\n                        artwork.get(&#039;year&#039;),<br \/>\n                        artwork.get(&#039;medium&#039;),<br \/>\n                        artwork.get(&#039;dimensions&#039;),<br \/>\n                        artwork.get(&#039;style&#039;),<br \/>\n                        artwork.get(&#039;genre&#039;),<br \/>\n                        artwork.get(&#039;description&#039;),<br \/>\n                        artwork.get(&#039;image_url&#039;),<br \/>\n                        artwork.get(&#039;source&#039;),<br \/>\n                        artwork.get(&#039;url&#039;)<br \/>\n                    )<\/p>\n<p>    async def _index_to_elasticsearch(self, artwork: Dict, artwork_id: str):<br \/>\n        &#034;&#034;&#034;\u7d22\u5f15\u5230Elasticsearch&#034;&#034;&#034;<br \/>\n        doc &#061; {<br \/>\n            &#039;title&#039;: artwork.get(&#039;title&#039;),<br \/>\n            &#039;artist&#039;: artwork.get(&#039;artist&#039;),<br \/>\n            &#039;description&#039;: artwork.get(&#039;description&#039;),<br \/>\n            &#039;year&#039;: artwork.get(&#039;year&#039;),<br \/>\n            &#039;style&#039;: artwork.get(&#039;style&#039;),<br \/>\n            &#039;genre&#039;: artwork.get(&#039;genre&#039;),<br \/>\n            &#039;medium&#039;: 
artwork.get(&#039;medium&#039;),<br \/>\n            &#039;tags&#039;: artwork.get(&#039;tags&#039;, []),<br \/>\n            &#039;created_at&#039;: datetime.now().isoformat()<br \/>\n        }<\/p>\n<p>        await self.es_client.index(<br \/>\n            index&#061;settings.ELASTICSEARCH_CONFIG[&#039;index&#039;],<br \/>\n            id&#061;artwork_id,<br \/>\n            body&#061;doc,<br \/>\n            refresh&#061;True<br \/>\n        )<\/p>\n<p>    async def _index_batch_to_elasticsearch(self, artworks: List[Dict], ids: List[str]):<br \/>\n        &#034;&#034;&#034;\u6279\u91cf\u7d22\u5f15\u5230Elasticsearch&#034;&#034;&#034;<br \/>\n        actions &#061; []<\/p>\n<p>        for i, (artwork, artwork_id) in enumerate(zip(artworks, ids)):<br \/>\n            doc &#061; {<br \/>\n                &#039;title&#039;: artwork.get(&#039;title&#039;),<br \/>\n                &#039;artist&#039;: artwork.get(&#039;artist&#039;),<br \/>\n                &#039;description&#039;: artwork.get(&#039;description&#039;),<br \/>\n                &#039;year&#039;: artwork.get(&#039;year&#039;),<br \/>\n                &#039;style&#039;: artwork.get(&#039;style&#039;),<br \/>\n                &#039;genre&#039;: artwork.get(&#039;genre&#039;),<br \/>\n                &#039;medium&#039;: artwork.get(&#039;medium&#039;),<br \/>\n                &#039;tags&#039;: artwork.get(&#039;tags&#039;, []),<br \/>\n                &#039;created_at&#039;: datetime.now().isoformat()<br \/>\n            }<\/p>\n<p>            action &#061; {<br \/>\n                &#039;_index&#039;: settings.ELASTICSEARCH_CONFIG[&#039;index&#039;],<br \/>\n                &#039;_id&#039;: artwork_id,<br \/>\n                &#039;_source&#039;: doc<br \/>\n            }<br \/>\n            actions.append(action)<\/p>\n<p>        if actions:<br \/>\n            await self.es_client.bulk(body&#061;actions, refresh&#061;True)<\/p>\n<p>    async def _store_image(self, image_url: str, artwork_id: str):<br 
\/>\n        &#034;&#034;&#034;\u5b58\u50a8\u56fe\u7247\u5230MinIO&#034;&#034;&#034;<br \/>\n        try:<br \/>\n            # \u4e0b\u8f7d\u56fe\u7247<br \/>\n            import aiohttp<br \/>\n            async with aiohttp.ClientSession() as session:<br \/>\n                async with session.get(image_url) as response:<br \/>\n                    if response.status &#061;&#061; 200:<br \/>\n                        data &#061; await response.read()<\/p>\n<p>                        # \u4e0a\u4f20\u5230MinIO<br \/>\n                        self.minio_client.put_object(<br \/>\n                            &#039;art-images&#039;,<br \/>\n                            f&#039;{artwork_id}.jpg&#039;,<br \/>\n                            io.BytesIO(data),<br \/>\n                            length&#061;len(data),<br \/>\n                            content_type&#061;&#039;image\/jpeg&#039;<br \/>\n                        )<br \/>\n        except Exception as e:<br \/>\n            print(f&#034;Error storing image: {e}&#034;)<\/p>\n<p>    async def search_artworks(self, query: str, size: int &#061; 10) -&gt; List[Dict]:<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u641c\u7d22\u827a\u672f\u54c1<\/p>\n<p>        Args:<br \/>\n            query: \u641c\u7d22\u5173\u952e\u8bcd<br \/>\n            size: \u8fd4\u56de\u6570\u91cf<\/p>\n<p>        Returns:<br \/>\n            \u641c\u7d22\u7ed3\u679c<br \/>\n        &#034;&#034;&#034;<br \/>\n        search_body &#061; {<br \/>\n            &#034;query&#034;: {<br \/>\n                &#034;multi_match&#034;: {<br \/>\n                    &#034;query&#034;: query,<br \/>\n                    &#034;fields&#034;: [&#034;title^3&#034;, &#034;artist^2&#034;, &#034;description&#034;, &#034;tags&#034;]<br \/>\n                }<br \/>\n            },<br \/>\n            &#034;size&#034;: size<br \/>\n        }<\/p>\n<p>        result &#061; await self.es_client.search(<br \/>\n            
index&#061;settings.ELASTICSEARCH_CONFIG[&#039;index&#039;],<br \/>\n            body&#061;search_body<br \/>\n        )<\/p>\n<p>        return [hit[&#039;_source&#039;] for hit in result[&#039;hits&#039;][&#039;hits&#039;]] <\/p>\n<h3>\u516d\u3001\u8c03\u5ea6\u4e0e\u76d1\u63a7<\/h3>\n<h4>6.1 \u4efb\u52a1\u8c03\u5ea6\u5668<\/h4>\n<p>python<\/p>\n<p># src\/scheduler.py<\/p>\n<p>import asyncio<br \/>\nimport signal<br \/>\nfrom typing import Dict, List, Optional<br \/>\nfrom datetime import datetime, timedelta<br \/>\nimport aioredis<br \/>\nfrom apscheduler.schedulers.asyncio import AsyncIOScheduler<br \/>\nfrom apscheduler.triggers.interval import IntervalTrigger<br \/>\nfrom apscheduler.triggers.cron import CronTrigger<br \/>\nimport json<\/p>\n<p>from src.crawlers.wikiart import WikiArtCrawler<br \/>\nfrom src.crawlers.artsy import ArtsyCrawler<br \/>\nfrom src.pipelines.storage import StoragePipeline<br \/>\nfrom src.pipelines.deduplication import DeduplicationPipeline<br \/>\nfrom src.pipelines.enrichment import DataEnrichmentPipeline<br \/>\nfrom src.utils.logger import setup_logger<\/p>\n<p>class CrawlerScheduler:<br \/>\n    &#034;&#034;&#034;<br \/>\n    \u722c\u866b\u4efb\u52a1\u8c03\u5ea6\u5668<br \/>\n    \u7ba1\u7406\u6240\u6709\u722c\u866b\u7684\u6267\u884c\u3001\u76d1\u63a7\u548c\u9519\u8bef\u5904\u7406<br \/>\n    &#034;&#034;&#034;<\/p>\n<p>    def __init__(self):<br \/>\n        self.logger &#061; setup_logger(&#034;scheduler&#034;)<br \/>\n        self.scheduler &#061; AsyncIOScheduler()<br \/>\n        self.running &#061; False<br \/>\n        self.redis &#061; None<\/p>\n<p>        # \u4efb\u52a1\u72b6\u6001<br \/>\n        self.task_status &#061; {}<\/p>\n<p>        # \u722c\u866b\u5b9e\u4f8b<br \/>\n        self.crawlers &#061; {<br \/>\n            &#039;wikiart&#039;: WikiArtCrawler,<br \/>\n            &#039;artsy&#039;: ArtsyCrawler<br \/>\n        }<\/p>\n<p>    async def initialize(self):<br \/>\n        
&#034;&#034;&#034;\u521d\u59cb\u5316\u8c03\u5ea6\u5668&#034;&#034;&#034;<br \/>\n        # \u8fde\u63a5Redis<br \/>\n        self.redis &#061; await aioredis.from_url(<br \/>\n            f&#034;redis:\/\/{settings.REDIS_CONFIG[&#039;host&#039;]}:{settings.REDIS_CONFIG[&#039;port&#039;]}&#034;,<br \/>\n            password&#061;settings.REDIS_CONFIG.get(&#039;password&#039;),<br \/>\n            decode_responses&#061;True<br \/>\n        )<\/p>\n<p>        # \u6062\u590d\u4efb\u52a1\u72b6\u6001<br \/>\n        await self._load_task_status()<\/p>\n<p>        # \u8bbe\u7f6e\u4fe1\u53f7\u5904\u7406<br \/>\n        loop &#061; asyncio.get_event_loop()<br \/>\n        for sig in (signal.SIGTERM, signal.SIGINT):<br \/>\n            loop.add_signal_handler(sig, lambda: asyncio.create_task(self.shutdown()))<\/p>\n<p>    async def _load_task_status(self):<br \/>\n        &#034;&#034;&#034;\u4eceRedis\u52a0\u8f7d\u4efb\u52a1\u72b6\u6001&#034;&#034;&#034;<br \/>\n        keys &#061; await self.redis.keys(&#034;task:*&#034;)<br \/>\n        for key in keys:<br \/>\n            status &#061; await self.redis.get(key)<br \/>\n            if status:<br \/>\n                self.task_status[key[5:]] &#061; json.loads(status)<\/p>\n<p>    async def save_task_status(self, task_name: str):<br \/>\n        &#034;&#034;&#034;\u4fdd\u5b58\u4efb\u52a1\u72b6\u6001\u5230Redis&#034;&#034;&#034;<br \/>\n        if task_name in self.task_status:<br \/>\n            await self.redis.setex(<br \/>\n                f&#034;task:{task_name}&#034;,<br \/>\n                86400 * 7,  # \u4fdd\u75597\u5929<br \/>\n                json.dumps(self.task_status[task_name], default&#061;str)<br \/>\n            )<\/p>\n<p>    def add_crawler_task(<br \/>\n        self,<br \/>\n        crawler_name: str,<br \/>\n        schedule_type: str &#061; &#034;interval&#034;,<br \/>\n        **kwargs<br \/>\n    ):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u6dfb\u52a0\u722c\u866b\u4efb\u52a1<\/p>\n<p>    
    Args:<br \/>\n            crawler_name: \u722c\u866b\u540d\u79f0<br \/>\n            schedule_type: \u8c03\u5ea6\u7c7b\u578b&#xff08;interval\/cron&#xff09;<br \/>\n            **kwargs: \u8c03\u5ea6\u53c2\u6570<br \/>\n        &#034;&#034;&#034;<br \/>\n        if crawler_name not in self.crawlers:<br \/>\n            self.logger.error(f&#034;Unknown crawler: {crawler_name}&#034;)<br \/>\n            return<\/p>\n<p>        task_id &#061; f&#034;{crawler_name}_task&#034;<\/p>\n<p>        # \u6839\u636e\u8c03\u5ea6\u7c7b\u578b\u521b\u5efa\u89e6\u53d1\u5668<br \/>\n        if schedule_type &#061;&#061; &#034;interval&#034;:<br \/>\n            trigger &#061; IntervalTrigger(<br \/>\n                minutes&#061;kwargs.get(&#039;minutes&#039;, 60),<br \/>\n                hours&#061;kwargs.get(&#039;hours&#039;, 0),<br \/>\n                days&#061;kwargs.get(&#039;days&#039;, 0)<br \/>\n            )<br \/>\n        elif schedule_type &#061;&#061; &#034;cron&#034;:<br \/>\n            trigger &#061; CronTrigger(<br \/>\n                hour&#061;kwargs.get(&#039;hour&#039;, &#039;*&#039;),<br \/>\n                minute&#061;kwargs.get(&#039;minute&#039;, &#039;0&#039;),<br \/>\n                day&#061;kwargs.get(&#039;day&#039;, &#039;*&#039;),<br \/>\n                month&#061;kwargs.get(&#039;month&#039;, &#039;*&#039;),<br \/>\n                day_of_week&#061;kwargs.get(&#039;day_of_week&#039;, &#039;*&#039;)<br \/>\n            )<br \/>\n        else:<br \/>\n            self.logger.error(f&#034;Unknown schedule type: {schedule_type}&#034;)<br \/>\n            return<\/p>\n<p>        # \u6dfb\u52a0\u4efb\u52a1<br \/>\n        self.scheduler.add_job(<br \/>\n            func&#061;self.run_crawler,<br \/>\n            trigger&#061;trigger,<br \/>\n            args&#061;[crawler_name],<br \/>\n            id&#061;task_id,<br \/>\n            name&#061;f&#034;Run {crawler_name} crawler&#034;,<br \/>\n            replace_existing&#061;True,<br \/>\n            
misfire_grace_time&#061;300<br \/>\n        )<\/p>\n<p>        self.logger.info(f&#034;Added crawler task: {crawler_name} ({schedule_type})&#034;)<\/p>\n<p>        # \u521d\u59cb\u5316\u4efb\u52a1\u72b6\u6001<br \/>\n        self.task_status[crawler_name] &#061; {<br \/>\n            &#039;last_run&#039;: None,<br \/>\n            &#039;last_success&#039;: None,<br \/>\n            &#039;last_error&#039;: None,<br \/>\n            &#039;total_runs&#039;: 0,<br \/>\n            &#039;successful_runs&#039;: 0,<br \/>\n            &#039;failed_runs&#039;: 0,<br \/>\n            &#039;total_items&#039;: 0<br \/>\n        }<\/p>\n<p>    async def run_crawler(self, crawler_name: str):<br \/>\n        &#034;&#034;&#034;<br \/>\n        \u8fd0\u884c\u722c\u866b<\/p>\n<p>        Args:<br \/>\n            crawler_name: \u722c\u866b\u540d\u79f0<br \/>\n        &#034;&#034;&#034;<br \/>\n        self.logger.info(f&#034;Starting crawler: {crawler_name}&#034;)<\/p>\n<p>        # \u66f4\u65b0\u72b6\u6001<br \/>\n        self.task_status[crawler_name][&#039;last_run&#039;] &#061; datetime.now()<br \/>\n        self.task_status[crawler_name][&#039;total_runs&#039;] &#043;&#061; 1<\/p>\n<p>        crawler_class &#061; self.crawlers[crawler_name]<\/p>\n<p>        try:<br \/>\n            # \u521b\u5efa\u722c\u866b\u5b9e\u4f8b<br \/>\n            async with crawler_class() as crawler:<br \/>\n                # \u8fd0\u884c\u722c\u866b<br \/>\n                artworks &#061; await crawler.run()<\/p>\n<p>                if artworks:<br \/>\n                    # \u53bb\u91cd<br \/>\n                    dedup_pipeline &#061; DeduplicationPipeline()<br \/>\n                    await dedup_pipeline.initialize()<\/p>\n<p>                    new_artworks, duplicates &#061; await dedup_pipeline.process_batch(artworks)<\/p>\n<p>                    # \u6570\u636e\u589e\u5f3a<br \/>\n                    enrich_pipeline &#061; DataEnrichmentPipeline()<br \/>\n                    enriched_artworks 
&#061; [<br \/>\n                        enrich_pipeline.enrich_artwork(artwork)<br \/>\n                        for artwork in new_artworks<br \/>\n                    ]<\/p>\n<p>                    # \u5b58\u50a8<br \/>\n                    storage_pipeline &#061; StoragePipeline()<br \/>\n                    await storage_pipeline.initialize()<\/p>\n<p>                    ids &#061; await storage_pipeline.store_batch(enriched_artworks)<\/p>\n<p>                    # \u5173\u95ed\u7ba1\u9053<br \/>\n                    await dedup_pipeline.close()<br \/>\n                    await storage_pipeline.close()<\/p>\n<p>                    # \u66f4\u65b0\u7edf\u8ba1<br \/>\n                    self.task_status[crawler_name][&#039;total_items&#039;] &#043;&#061; len(ids)<\/p>\n<p>                self.logger.info(<br \/>\n                    f&#034;Crawler {crawler_name} completed. &#034;<br \/>\n                    f&#034;Collected: {len(artworks)}, &#034;<br \/>\n                    f&#034;New: {len(new_artworks)}, &#034;<br \/>\n                    f&#034;Duplicates: {len(duplicates)}&#034;<br \/>\n                )<\/p>\n<p>                # \u66f4\u65b0\u6210\u529f\u72b6\u6001<br \/>\n                self.task_status[crawler_name][&#039;last_success&#039;] &#061; datetime.now()<br \/>\n                self.task_status[crawler_name][&#039;successful_runs&#039;] &#043;&#061; 1<\/p>\n<p>        except Exception as e:<br \/>\n            self.logger.error(f&#034;Crawler {crawler_name} failed: {e}&#034;, exc_info&#061;True)<\/p>\n<p>            # \u66f4\u65b0\u5931\u8d25\u72b6\u6001<br \/>\n            self.task_status[crawler_name][&#039;last_error&#039;] &#061; {<br \/>\n                &#039;time&#039;: datetime.now(),<br \/>\n                &#039;error&#039;: str(e)<br \/>\n            }<br \/>\n            self.task_status[crawler_name][&#039;failed_runs&#039;] &#043;&#061; 1<\/p>\n<p>        finally:<br \/>\n            # \u4fdd\u5b58\u72b6\u6001<br \/>\n          
  await self.save_task_status(crawler_name)<\/p>\n<p>    async def run_all_crawlers(self):<br \/>\n        &#034;&#034;&#034;\u8fd0\u884c\u6240\u6709\u722c\u866b&#034;&#034;&#034;<br \/>\n        tasks &#061; []<br \/>\n        for crawler_name in self.crawlers:<br \/>\n            task &#061; self.run_crawler(crawler_name)<br \/>\n            tasks.append(task)<\/p>\n<p>        await asyncio.gather(*tasks, return_exceptions&#061;True)<\/p>\n<p>    async def get_status(self) -&gt; Dict:<br \/>\n        &#034;&#034;&#034;\u83b7\u53d6\u6240\u6709\u4efb\u52a1\u72b6\u6001&#034;&#034;&#034;<br \/>\n        return {<br \/>\n            &#039;running&#039;: self.running,<br \/>\n            &#039;tasks&#039;: self.task_status,<br \/>\n            &#039;scheduler_jobs&#039;: [<br \/>\n                {<br \/>\n                    &#039;id&#039;: job.id,<br \/>\n                    &#039;name&#039;: job.name,<br \/>\n                    &#039;next_run&#039;: job.next_run_time.isoformat() if job.next_run_time else None<br \/>\n                }<br \/>\n                for job in self.scheduler.get_jobs()<br \/>\n            ]<br \/>\n        }<\/p>\n<p>    def start(self):<br \/>\n        &#034;&#034;&#034;\u542f\u52a8\u8c03\u5ea6\u5668&#034;&#034;&#034;<br \/>\n        self.scheduler.start()<br \/>\n        self.running &#061; True<br \/>\n        self.logger.info(&#034;Scheduler started&#034;)<\/p>\n<p>    async def shutdown(self):<br \/>\n        &#034;&#034;&#034;\u5173\u95ed\u8c03\u5ea6\u5668&#034;&#034;&#034;<br \/>\n        self.logger.info(&#034;Shutting down scheduler&#8230;&#034;)<\/p>\n<p>        self.scheduler.shutdown()<br \/>\n        self.running &#061; False<\/p>\n<p>        if self.redis:<br \/>\n            await self.redis.close()<\/p>\n<p>        self.logger.info(&#034;Scheduler stopped&#034;)<\/p>\n<p>    async def run_once(self, crawler_name: str):<br \/>\n        &#034;&#034;&#034;<br \/>\n        
\u7acb\u5373\u8fd0\u884c\u4e00\u6b21\u722c\u866b<\/p>\n<p>        Args:<br \/>\n            crawler_name: \u722c\u866b\u540d\u79f0<br \/>\n        &#034;&#034;&#034;<br \/>\n        await self.run_crawler(crawler_name) <\/p>\n<h4>6.2 Web\u76d1\u63a7\u754c\u9762<\/h4>\n<p>python<\/p>\n<p># src\/web\/dashboard.py<\/p>\n<p>from fastapi import FastAPI, WebSocket, WebSocketDisconnect<br \/>\nfrom fastapi.responses import HTMLResponse<br \/>\nfrom fastapi.staticfiles import StaticFiles<br \/>\nfrom fastapi.templating import Jinja2Templates<br \/>\nfrom starlette.requests import Request<br \/>\nimport plotly.graph_objects as go<br \/>\nimport plotly.express as px<br \/>\nfrom datetime import datetime, timedelta<br \/>\nimport pandas as pd<br \/>\nfrom typing import List, Dict<br \/>\nimport json<\/p>\n<p>from src.scheduler import CrawlerScheduler<br \/>\nfrom src.pipelines.storage import StoragePipeline<\/p>\n<p>app &#061; FastAPI(title&#061;&#034;Art Crawler Dashboard&#034;)<\/p>\n<p># \u6a21\u677f\u548c\u9759\u6001\u6587\u4ef6<br \/>\ntemplates &#061; Jinja2Templates(directory&#061;&#034;src\/web\/templates&#034;)<br \/>\napp.mount(&#034;\/static&#034;, StaticFiles(directory&#061;&#034;src\/web\/static&#034;), name&#061;&#034;static&#034;)<\/p>\n<p># \u5168\u5c40\u5b9e\u4f8b<br \/>\nscheduler &#061; CrawlerScheduler()<br \/>\nstorage &#061; StoragePipeline()<\/p>\n<p>class ConnectionManager:<br \/>\n    &#034;&#034;&#034;WebSocket\u8fde\u63a5\u7ba1\u7406\u5668&#034;&#034;&#034;<br \/>\n    def __init__(self):<br \/>\n        self.active_connections: List[WebSocket] &#061; []<\/p>\n<p>    async def connect(self, websocket: WebSocket):<br \/>\n        await websocket.accept()<br \/>\n        self.active_connections.append(websocket)<\/p>\n<p>    def disconnect(self, websocket: WebSocket):<br \/>\n        self.active_connections.remove(websocket)<\/p>\n<p>    async def broadcast(self, message: dict):<br \/>\n        for connection in self.active_connections:<br \/>\n       
     try:<br \/>\n                await connection.send_json(message)<br \/>\n            except:<br \/>\n                pass<\/p>\n<p>manager &#061; ConnectionManager()<\/p>\n<p>&#064;app.on_event(&#034;startup&#034;)<br \/>\nasync def startup_event():<br \/>\n    &#034;&#034;&#034;\u542f\u52a8\u4e8b\u4ef6&#034;&#034;&#034;<br \/>\n    await scheduler.initialize()<br \/>\n    await storage.initialize()<\/p>\n<p>&#064;app.on_event(&#034;shutdown&#034;)<br \/>\nasync def shutdown_event():<br \/>\n    &#034;&#034;&#034;\u5173\u95ed\u4e8b\u4ef6&#034;&#034;&#034;<br \/>\n    await storage.close()<\/p>\n<p>&#064;app.get(&#034;\/&#034;, response_class&#061;HTMLResponse)<br \/>\nasync def dashboard(request: Request):<br \/>\n    &#034;&#034;&#034;\u4eea\u8868\u76d8\u4e3b\u9875&#034;&#034;&#034;<br \/>\n    return templates.TemplateResponse(<br \/>\n        &#034;dashboard.html&#034;,<br \/>\n        {&#034;request&#034;: request}<br \/>\n    )<\/p>\n<p>&#064;app.get(&#034;\/api\/status&#034;)<br \/>\nasync def get_status():<br \/>\n    &#034;&#034;&#034;\u83b7\u53d6\u7cfb\u7edf\u72b6\u6001&#034;&#034;&#034;<br \/>\n    return await scheduler.get_status()<\/p>\n<p>&#064;app.get(&#034;\/api\/stats&#034;)<br \/>\nasync def get_stats(days: int &#061; 7):<br \/>\n    &#034;&#034;&#034;\u83b7\u53d6\u7edf\u8ba1\u6570\u636e&#034;&#034;&#034;<br \/>\n    end_date &#061; datetime.now()<br \/>\n    start_date &#061; end_date &#8211; timedelta(days&#061;days)<\/p>\n<p>    # \u4ece\u6570\u636e\u5e93\u83b7\u53d6\u7edf\u8ba1<br \/>\n    pipeline &#061; [<br \/>\n        {<br \/>\n            &#034;$match&#034;: {<br \/>\n                &#034;created_at&#034;: {<br \/>\n                    &#034;$gte&#034;: start_date,<br \/>\n                    &#034;$lte&#034;: end_date<br \/>\n                }<br \/>\n            }<br \/>\n        },<br \/>\n        {<br \/>\n            &#034;$group&#034;: {<br \/>\n                &#034;_id&#034;: {<br \/>\n                    &#034;year&#034;: 
{&#034;$year&#034;: &#034;$created_at&#034;},<br \/>\n                    &#034;month&#034;: {&#034;$month&#034;: &#034;$created_at&#034;},<br \/>\n                    &#034;day&#034;: {&#034;$dayOfMonth&#034;: &#034;$created_at&#034;}<br \/>\n                },<br \/>\n                &#034;count&#034;: {&#034;$sum&#034;: 1},<br \/>\n                &#034;source&#034;: {&#034;$first&#034;: &#034;$source&#034;}<br \/>\n            }<br \/>\n        },<br \/>\n        {&#034;$sort&#034;: {&#034;_id&#034;: 1}}<br \/>\n    ]<\/p>\n<p>    cursor &#061; storage.mongo_db.artworks.aggregate(pipeline)<br \/>\n    results &#061; await cursor.to_list(length&#061;None)<\/p>\n<p>    # \u8f6c\u6362\u4e3aDataFrame<br \/>\n    df &#061; pd.DataFrame(results)<\/p>\n<p>    # \u751f\u6210\u56fe\u8868<br \/>\n    charts &#061; {}<\/p>\n<p>    if not df.empty:<br \/>\n        # \u6bcf\u65e5\u91c7\u96c6\u6570\u91cf\u6298\u7ebf\u56fe<br \/>\n        fig_line &#061; go.Figure()<br \/>\n        for source in df[&#039;source&#039;].unique():<br \/>\n            source_data &#061; df[df[&#039;source&#039;] &#061;&#061; source]<br \/>\n            fig_line.add_trace(go.Scatter(<br \/>\n                x&#061;pd.to_datetime(source_data[&#039;_id&#039;].apply(<br \/>\n                    lambda x: f&#034;{x[&#039;year&#039;]}-{x[&#039;month&#039;]}-{x[&#039;day&#039;]}&#034;<br \/>\n                )),<br \/>\n                y&#061;source_data[&#039;count&#039;],<br \/>\n                mode&#061;&#039;lines&#043;markers&#039;,<br \/>\n                name&#061;source<br \/>\n            ))<br \/>\n        fig_line.update_layout(<br \/>\n            title&#061;&#034;\u6bcf\u65e5\u91c7\u96c6\u6570\u91cf\u8d8b\u52bf&#034;,<br \/>\n            xaxis_title&#061;&#034;\u65e5\u671f&#034;,<br \/>\n            yaxis_title&#061;&#034;\u6570\u91cf&#034;<br \/>\n        )<br \/>\n        charts[&#039;daily_trend&#039;] &#061; fig_line.to_json()<\/p>\n<p>        # \u6765\u6e90\u5206\u5e03\u997c\u56fe<br 
\/>\n        source_counts &#061; df.groupby(&#039;source&#039;)[&#039;count&#039;].sum().reset_index()<br \/>\n        fig_pie &#061; px.pie(<br \/>\n            source_counts,<br \/>\n            values&#061;&#039;count&#039;,<br \/>\n            names&#061;&#039;source&#039;,<br \/>\n            title&#061;&#034;\u6570\u636e\u6765\u6e90\u5206\u5e03&#034;<br \/>\n        )<br \/>\n        charts[&#039;source_distribution&#039;] &#061; fig_pie.to_json()<\/p>\n<p>        # \u827a\u672f\u5bb6Top 10<br \/>\n        pipeline_artists &#061; [<br \/>\n            {&#034;$group&#034;: {&#034;_id&#034;: &#034;$artist&#034;, &#034;count&#034;: {&#034;$sum&#034;: 1}}},<br \/>\n            {&#034;$sort&#034;: {&#034;count&#034;: -1}},<br \/>\n            {&#034;$limit&#034;: 10}<br \/>\n        ]<br \/>\n        top_artists &#061; await storage.mongo_db.artworks.aggregate(<br \/>\n            pipeline_artists<br \/>\n        ).to_list(length&#061;None)<\/p>\n<p>        if top_artists:<br \/>\n            fig_bar &#061; go.Figure(data&#061;[<br \/>\n                go.Bar(<br \/>\n                    x&#061;[a[&#039;_id&#039;] for a in top_artists],<br \/>\n                    y&#061;[a[&#039;count&#039;] for a in top_artists]<br \/>\n                )<br \/>\n            ])<br \/>\n            fig_bar.update_layout(<br \/>\n                title&#061;&#034;\u4f5c\u54c1\u6700\u591a\u7684\u827a\u672f\u5bb6Top 10&#034;,<br \/>\n                xaxis_title&#061;&#034;\u827a\u672f\u5bb6&#034;,<br \/>\n                yaxis_title&#061;&#034;\u4f5c\u54c1\u6570\u91cf&#034;<br \/>\n            )<br \/>\n            charts[&#039;top_artists&#039;] &#061; fig_bar.to_json()<\/p>\n<p>    # \u83b7\u53d6\u603b\u6570<br \/>\n    total_count &#061; await storage.mongo_db.artworks.count_documents({})<\/p>\n<p>    # \u83b7\u53d6\u6700\u65b0\u4f5c\u54c1<br \/>\n    latest &#061; await storage.mongo_db.artworks.find().sort(<br \/>\n        &#034;created_at&#034;, -1<br \/>\n    
).limit(10).to_list(length&#061;None)<\/p>\n<p>    return {<br \/>\n        &#034;total&#034;: total_count,<br \/>\n        &#034;charts&#034;: charts,<br \/>\n        &#034;latest&#034;: [<br \/>\n            {<br \/>\n                &#034;title&#034;: item.get(&#034;title&#034;),<br \/>\n                &#034;artist&#034;: item.get(&#034;artist&#034;),<br \/>\n                &#034;date&#034;: item.get(&#034;created_at&#034;).isoformat()<br \/>\n            }<br \/>\n            for item in latest<br \/>\n        ]<br \/>\n    }<\/p>\n<p>&#064;app.post(&#034;\/api\/crawlers\/{crawler_name}\/run&#034;)<br \/>\nasync def run_crawler(crawler_name: str):<br \/>\n    &#034;&#034;&#034;\u624b\u52a8\u8fd0\u884c\u722c\u866b&#034;&#034;&#034;<br \/>\n    asyncio.create_task(scheduler.run_once(crawler_name))<br \/>\n    return {&#034;status&#034;: &#034;started&#034;, &#034;crawler&#034;: crawler_name}<\/p>\n<p>&#064;app.post(&#034;\/api\/crawlers\/schedule&#034;)<br \/>\nasync def update_schedule(config: dict):<br \/>\n    &#034;&#034;&#034;\u66f4\u65b0\u8c03\u5ea6\u914d\u7f6e&#034;&#034;&#034;<br \/>\n    # \u66f4\u65b0\u8c03\u5ea6\u5668\u914d\u7f6e<br \/>\n    pass<\/p>\n<p>&#064;app.websocket(&#034;\/ws&#034;)<br \/>\nasync def websocket_endpoint(websocket: WebSocket):<br \/>\n    &#034;&#034;&#034;WebSocket\u5b9e\u65f6\u66f4\u65b0&#034;&#034;&#034;<br \/>\n    await manager.connect(websocket)<br \/>\n    try:<br \/>\n        while True:<br \/>\n            # \u6bcf\u79d2\u53d1\u9001\u4e00\u6b21\u72b6\u6001\u66f4\u65b0<br \/>\n            status &#061; await scheduler.get_status()<br \/>\n            await websocket.send_json(status)<br \/>\n            await asyncio.sleep(1)<br \/>\n    except WebSocketDisconnect:<br \/>\n        manager.disconnect(websocket)<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u4e00\u3001\u5f15\u8a00&#xff1a;\u4e3a\u4ec0\u4e48\u9700\u8981\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93&#xff1f;<br 
\/>\n\u5728\u6570\u5b57\u65f6\u4ee3&#xff0c;\u827a\u672f\u4f5c\u54c1\u7684\u6570\u5b57\u5316\u7ba1\u7406\u548c\u5206\u6790\u5df2\u6210\u4e3a\u827a\u672f\u754c\u3001\u5b66\u672f\u754c\u548c\u5546\u4e1a\u9886\u57df\u7684\u91cd\u8981\u9700\u6c42\u3002\u65e0\u8bba\u662f\u535a\u7269\u9986\u7684\u85cf\u54c1\u7ba1\u7406\u3001\u827a\u672f\u5e02\u573a\u7684\u4ef7\u683c\u5206\u6790&#xff0c;\u8fd8\u662fAI\u827a\u672f\u521b\u4f5c\u7684\u8bad\u7ec3\u6570\u636e&#xff0c;\u4e00\u4e2a\u7ed3\u6784\u5b8c\u5584\u7684\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93\u90fd\u626e\u6f14\u7740\u81f3\u5173\u91cd\u8981\u7684\u89d2\u8272\u3002<br \/>\n1.1 \u827a\u672f\u4f5c\u54c1\u6570\u636e\u7684\u4ef7\u503c<br \/>\n\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5305\u542b\u4e86\u4e30\u5bcc\u7684\u4fe1\u606f\u7ef4\u5ea6&#xff1a;\u521b\u4f5c\u80cc\u666f\u3001\u827a\u672f\u5bb6\u751f\u5e73\u3001\u6280\u6cd5\u6750\u6599\u3001\u5c3a\u5bf8\u89c4\u683c\u3001\u6536\u85cf\u5386\u53f2\u3001\u5e02\u573a\u4ef7\u503c\u7b49\u3002\u8fd9\u4e9b\u6570\u636e\u4e0d\u4ec5\u5bf9\u4e8e\u827a\u672f<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[81,3677,190,677,2653],"topic":[],"class_list":["post-76997","post","type-post","status-publish","format-standard","hentry","category-server","tag-python","tag-tensorflow","tag-190","tag-677","tag-2653"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v20.3 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/www.wsisp.com\/helps\/76997.html\" 
\/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"og:description\" content=\"\u4e00\u3001\u5f15\u8a00&#xff1a;\u4e3a\u4ec0\u4e48\u9700\u8981\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93&#xff1f; \u5728\u6570\u5b57\u65f6\u4ee3&#xff0c;\u827a\u672f\u4f5c\u54c1\u7684\u6570\u5b57\u5316\u7ba1\u7406\u548c\u5206\u6790\u5df2\u6210\u4e3a\u827a\u672f\u754c\u3001\u5b66\u672f\u754c\u548c\u5546\u4e1a\u9886\u57df\u7684\u91cd\u8981\u9700\u6c42\u3002\u65e0\u8bba\u662f\u535a\u7269\u9986\u7684\u85cf\u54c1\u7ba1\u7406\u3001\u827a\u672f\u5e02\u573a\u7684\u4ef7\u683c\u5206\u6790&#xff0c;\u8fd8\u662fAI\u827a\u672f\u521b\u4f5c\u7684\u8bad\u7ec3\u6570\u636e&#xff0c;\u4e00\u4e2a\u7ed3\u6784\u5b8c\u5584\u7684\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93\u90fd\u626e\u6f14\u7740\u81f3\u5173\u91cd\u8981\u7684\u89d2\u8272\u3002 1.1 \u827a\u672f\u4f5c\u54c1\u6570\u636e\u7684\u4ef7\u503c \u827a\u672f\u4f5c\u54c1\u6570\u636e\u5305\u542b\u4e86\u4e30\u5bcc\u7684\u4fe1\u606f\u7ef4\u5ea6&#xff1a;\u521b\u4f5c\u80cc\u666f\u3001\u827a\u672f\u5bb6\u751f\u5e73\u3001\u6280\u6cd5\u6750\u6599\u3001\u5c3a\u5bf8\u89c4\u683c\u3001\u6536\u85cf\u5386\u53f2\u3001\u5e02\u573a\u4ef7\u503c\u7b49\u3002\u8fd9\u4e9b\u6570\u636e\u4e0d\u4ec5\u5bf9\u4e8e\u827a\u672f\" \/>\n<meta property=\"og:url\" content=\"https:\/\/www.wsisp.com\/helps\/76997.html\" \/>\n<meta property=\"og:site_name\" content=\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\" \/>\n<meta property=\"article:published_time\" content=\"2026-02-23T11:09:24+00:00\" \/>\n<meta name=\"author\" content=\"admin\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" 
\/>\n\t<meta name=\"twitter:data1\" content=\"admin\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"39 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/76997.html\",\"url\":\"https:\/\/www.wsisp.com\/helps\/76997.html\",\"name\":\"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"isPartOf\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\"},\"datePublished\":\"2026-02-23T11:09:24+00:00\",\"dateModified\":\"2026-02-23T11:09:24+00:00\",\"author\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\"},\"breadcrumb\":{\"@id\":\"https:\/\/www.wsisp.com\/helps\/76997.html#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/www.wsisp.com\/helps\/76997.html\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/76997.html#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/www.wsisp.com\/helps\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#website\",\"url\":\"https:\/\/www.wsisp.com\/helps\/\",\"name\":\"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3\",\"description\":\"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b\",\"potentialAction\":[{\"@
type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}\"},\"query-input\":\"required name=search_term_string\"}],\"inLanguage\":\"zh-Hans\"},{\"@type\":\"Person\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41\",\"name\":\"admin\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"contentUrl\":\"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery\",\"caption\":\"admin\"},\"sameAs\":[\"http:\/\/wp.wsisp.com\"],\"url\":\"https:\/\/www.wsisp.com\/helps\/author\/admin\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/www.wsisp.com\/helps\/76997.html","og_locale":"zh_CN","og_type":"article","og_title":"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357 - \u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","og_description":"\u4e00\u3001\u5f15\u8a00&#xff1a;\u4e3a\u4ec0\u4e48\u9700\u8981\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93&#xff1f; 
\u5728\u6570\u5b57\u65f6\u4ee3&#xff0c;\u827a\u672f\u4f5c\u54c1\u7684\u6570\u5b57\u5316\u7ba1\u7406\u548c\u5206\u6790\u5df2\u6210\u4e3a\u827a\u672f\u754c\u3001\u5b66\u672f\u754c\u548c\u5546\u4e1a\u9886\u57df\u7684\u91cd\u8981\u9700\u6c42\u3002\u65e0\u8bba\u662f\u535a\u7269\u9986\u7684\u85cf\u54c1\u7ba1\u7406\u3001\u827a\u672f\u5e02\u573a\u7684\u4ef7\u683c\u5206\u6790&#xff0c;\u8fd8\u662fAI\u827a\u672f\u521b\u4f5c\u7684\u8bad\u7ec3\u6570\u636e&#xff0c;\u4e00\u4e2a\u7ed3\u6784\u5b8c\u5584\u7684\u827a\u672f\u4f5c\u54c1\u6570\u636e\u5e93\u90fd\u626e\u6f14\u7740\u81f3\u5173\u91cd\u8981\u7684\u89d2\u8272\u3002 1.1 \u827a\u672f\u4f5c\u54c1\u6570\u636e\u7684\u4ef7\u503c \u827a\u672f\u4f5c\u54c1\u6570\u636e\u5305\u542b\u4e86\u4e30\u5bcc\u7684\u4fe1\u606f\u7ef4\u5ea6&#xff1a;\u521b\u4f5c\u80cc\u666f\u3001\u827a\u672f\u5bb6\u751f\u5e73\u3001\u6280\u6cd5\u6750\u6599\u3001\u5c3a\u5bf8\u89c4\u683c\u3001\u6536\u85cf\u5386\u53f2\u3001\u5e02\u573a\u4ef7\u503c\u7b49\u3002\u8fd9\u4e9b\u6570\u636e\u4e0d\u4ec5\u5bf9\u4e8e\u827a\u672f","og_url":"https:\/\/www.wsisp.com\/helps\/76997.html","og_site_name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","article_published_time":"2026-02-23T11:09:24+00:00","author":"admin","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"admin","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"39 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/www.wsisp.com\/helps\/76997.html","url":"https:\/\/www.wsisp.com\/helps\/76997.html","name":"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357 - 
\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","isPartOf":{"@id":"https:\/\/www.wsisp.com\/helps\/#website"},"datePublished":"2026-02-23T11:09:24+00:00","dateModified":"2026-02-23T11:09:24+00:00","author":{"@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41"},"breadcrumb":{"@id":"https:\/\/www.wsisp.com\/helps\/76997.html#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/www.wsisp.com\/helps\/76997.html"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/www.wsisp.com\/helps\/76997.html#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/www.wsisp.com\/helps"},{"@type":"ListItem","position":2,"name":"\u6784\u5efa\u5927\u89c4\u6a21\u827a\u672f\u4f5c\u54c1\u4fe1\u606f\u6570\u636e\u5e93\uff1a\u4ece\u96f6\u5f00\u59cb\u7684Python\u722c\u866b\u5b9e\u6218\u6307\u5357"}]},{"@type":"WebSite","@id":"https:\/\/www.wsisp.com\/helps\/#website","url":"https:\/\/www.wsisp.com\/helps\/","name":"\u7f51\u7855\u4e92\u8054\u5e2e\u52a9\u4e2d\u5fc3","description":"\u9999\u6e2f\u670d\u52a1\u5668_\u9999\u6e2f\u4e91\u670d\u52a1\u5668\u8d44\u8baf_\u670d\u52a1\u5668\u5e2e\u52a9\u6587\u6863_\u670d\u52a1\u5668\u6559\u7a0b","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/www.wsisp.com\/helps\/?s={search_term_string}"},"query-input":"required 
name=search_term_string"}],"inLanguage":"zh-Hans"},{"@type":"Person","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/358e386c577a3ab51c4493330a20ad41","name":"admin","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/www.wsisp.com\/helps\/#\/schema\/person\/image\/","url":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","contentUrl":"https:\/\/gravatar.wp-china-yes.net\/avatar\/?s=96&d=mystery","caption":"admin"},"sameAs":["http:\/\/wp.wsisp.com"],"url":"https:\/\/www.wsisp.com\/helps\/author\/admin"}]}},"_links":{"self":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/76997","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/comments?post=76997"}],"version-history":[{"count":0,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/posts\/76997\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/media?parent=76997"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/categories?post=76997"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/tags?post=76997"},{"taxonomy":"topic","embeddable":true,"href":"https:\/\/www.wsisp.com\/helps\/wp-json\/wp\/v2\/topic?post=76997"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}