{"id":3132,"date":"2019-04-22T20:44:00","date_gmt":"2019-04-22T12:44:00","guid":{"rendered":"https:\/\/shanlin.info\/?p=3132"},"modified":"2021-04-25T20:46:23","modified_gmt":"2021-04-25T12:46:23","slug":"%e4%bd%bf%e7%94%a8scrapy%e5%88%9b%e5%bb%ba%e7%88%ac%e8%99%ab%e5%92%8c%e5%b8%b8%e7%94%a8%e5%91%bd%e4%bb%a4","status":"publish","type":"post","link":"https:\/\/shanlin.info\/?p=3132","title":{"rendered":"\u4f7f\u7528Scrapy\u521b\u5efa\u722c\u866b\u548c\u5e38\u7528\u547d\u4ee4"},"content":{"rendered":"\n<p>\u672c\u6587\u4e3b\u8981\u8bb0\u5f55Scrapy\u7684\u5e38\u7528\u547d\u4ee4\uff0c\u7528\u4e8e\u5907\u5fd8\u3002\u9002\u7528\u4e8eWindows\u5e73\u53f0\u3002<br>\u4f8b\u5982\uff0c\u6211\u4eec\u8981\u722c\u53d6\u8fd9\u4e2a\u7f51\u7ad9\uff1a<a href=\"https:\/\/links.jianshu.com\/go?to=https%3A%2F%2Fwww.tudinet.com%2Fmarket-252-0-0-0%2F\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/www.tudinet.com\/market-252-0-0-0\/<\/a>&nbsp;\u91cd\u5e86\u5730\u533a\u7684\u571f\u5730\u8f6c\u8ba9\u4fe1\u606f\u3002<\/p>\n\n\n\n<p>\u6574\u4f53\u6d41\u7a0b\u5982\u4e0b\uff1a<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>1\u3001\u4f7f\u7528scrapy startproject cq_land\u547d\u4ee4\u521b\u5efa\u9879\u76ee\n2\u3001\u4fee\u6539settings.py\uff0c\u4f7f\u722c\u866b\u751f\u6548\uff08ITEM_PIPELINES\u3001 USER_AGENT \u7b49\uff09\n3\u3001\u4fee\u6539items.py\uff0c\u7528\u4e8e\u5b58\u50a8\u722c\u53d6\u56de\u6765\u7684\u6570\u636e\n4\u3001\u4f7f\u7528scrapy genspider tudinet tudinet.com \u547d\u4ee4\uff0c\u521b\u5efa\u722c\u866b\u6587\u4ef6\uff0c\u7528\u4e8e\u722c\u53d6\u7f51\u9875\u5185\u5bb9\n5\u3001\u7f16\u5199\u4e0a\u4e00\u6b65\u751f\u6210\u7684tudinet.py\u722c\u866b\u6587\u4ef6\uff0c\u5b8c\u6210\u7f51\u9875\u5185\u5bb9\u89e3\u6790\n6\u3001\u4fee\u6539pipelines.py\u6587\u4ef6\uff0c\u5bf9\u83b7\u53d6\u5230\u7684\u4fe1\u606f\u8fdb\u884c\u6574\u7406\uff0c\u5b8c\u6210\u5b58\u50a8\n<\/code><\/pre>\n\n\n\n<h6 
class=\"wp-block-heading\">1\u3001\u4e07\u4e8b\u7b2c\u4e00\u6b65\uff1a\u521b\u5efa\u5de5\u7a0b<\/h6>\n\n\n\n<p>\u9996\u5148\u5728cmd\u6216powershell\u7a97\u53e3\uff0cCD\u5230\u60f3\u8981\u521b\u5efa\u9879\u76ee\u7684\u76ee\u5f55\uff0c\u7136\u540e\u8f93\u5165\u4ee5\u4e0b\u547d\u4ee4\uff0c\u521b\u5efa\u4e86\u4e00\u4e2a\u540d\u4e3acq_land\u7684\u9879\u76ee\u3002<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>scrapy startproject cq_land\nPS C:\\WINDOWS\\system32&gt; e:\nPS E:\\&gt; cd E:\\web_data\nPS E:\\web_data&gt; scrapy startproject cq_land\nNew Scrapy project 'cq_land', using template directory 'c:\\\\programdata\\\\anaconda3\\\\lib\\\\site-packages\\\\scrapy\\\\templates\\\\project', created in:\n    E:\\web_data\\cq_land\n\nYou can start your first spider with:\n    cd cq_land\n    scrapy genspider example example.com\nPS E:\\web_data&gt;\n<\/code><\/pre>\n\n\n\n<p>\u8fd9\u6837\u5c31\u751f\u6210\u4e86\u4e00\u4e2acq_land\u7684\u6587\u4ef6\u76ee\u5f55\uff08\u5b8c\u6210\u540e\u5148\u4e0d\u8981\u5173\u95ed\u7ec8\u7aef\u7a97\u53e3\uff0c\u540e\u9762\u7b2c4\u6b65\u8fd8\u4f1a\u7528\u5230\uff09\u3002\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u4e3b\u8981\u9488\u5bf9items.py\u3001settings.py\u3001pipelines.py\u548cspiders\u6587\u4ef6\u5939\u8fdb\u884c\u4fee\u6539\u3002<\/p>\n\n\n\n<h6 class=\"wp-block-heading\">2\u3001\u4fee\u6539settings.py\uff0c\u4f7f\u722c\u866b\u751f\u6548<\/h6>\n\n\n\n<p>\u5c06settings.py\u4e2d\uff0cITEM_PIPELINES \u9644\u8fd1\u7684\u6ce8\u91ca\u53bb\u6389\uff0c\u4fee\u6539\u4e3a\uff1a<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>\u4fee\u6539\u524d\uff1a\n#ITEM_PIPELINES = {\n#    'cq_land.pipelines.CqLandPipeline': 300,\n#}\n\n\u4fee\u6539\u540e\uff1a\nITEM_PIPELINES = {\n    'cq_land.pipelines.CqLandPipeline': 
300,\n}\n<\/code><\/pre>\n\n\n\n<p>\u6709\u4e9b\u7f51\u7ad9\u53ef\u80fd\u9700\u8981\u8bbe\u7f6eUSER_AGENT\uff0c\u6240\u4ee5\uff0c\u52a0\u4e0aUSER_AGENT\u9632\u6b62\u4e00\u822c\u7684\u7f51\u7ad9\u53cd\u722c\u3002<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>USER_AGENT = \"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/73.0.3683.86 Safari\/537.36\"\n<\/code><\/pre>\n\n\n\n<h6 class=\"wp-block-heading\">3\u3001\u4fee\u6539items.py\uff0c\u7528\u4e8e\u5b58\u50a8\u722c\u53d6\u56de\u6765\u7684\u6570\u636e<\/h6>\n\n\n\n<figure class=\"wp-block-gallery columns-1 is-cropped wp-block-gallery-1 is-layout-flex wp-block-gallery-is-layout-flex\"><ul class=\"blocks-gallery-grid\"><li class=\"blocks-gallery-item\"><figure><a href=\"https:\/\/shanlin.info\/wp-content\/uploads\/2021\/04\/5.png\"><img loading=\"lazy\" decoding=\"async\" width=\"907\" height=\"258\" src=\"https:\/\/shanlin.info\/wp-content\/uploads\/2021\/04\/5.png\" alt=\"\" data-id=\"3133\" data-full-url=\"https:\/\/shanlin.info\/wp-content\/uploads\/2021\/04\/5.png\" data-link=\"https:\/\/shanlin.info\/?attachment_id=3133\" class=\"wp-image-3133\" srcset=\"https:\/\/shanlin.info\/wp-content\/uploads\/2021\/04\/5.png 907w, https:\/\/shanlin.info\/wp-content\/uploads\/2021\/04\/5-300x85.png 300w, https:\/\/shanlin.info\/wp-content\/uploads\/2021\/04\/5-768x218.png 768w\" sizes=\"(max-width: 907px) 100vw, 907px\" 
\/><\/a><\/figure><\/li><\/ul><\/figure>\n\n\n\n<p>\u571f\u5730\u51fa\u8ba9\u4fe1\u606f<\/p>\n\n\n\n<p>\u89c2\u5bdf\u6570\u636e\u7ed3\u6784\uff0c\u4e3b\u8981\u4fe1\u606f\u6709\u5f88\u591a\u5b57\u6bb5\uff0c\u4ee5\u83b7\u53d6\u6807\u9898\u548c\u63a8\u51fa\u65f6\u95f4\u4e3a\u4f8b\uff0c\u5b9a\u4e49\u4e24\u4e2aitem\u9879\u76ee\uff08items\u91cc\u9762\u5b9a\u4e49\u7684\u5185\u5bb9\uff0c\u53ef\u4ee5\u7406\u89e3\u4e3a\u5b9a\u4e49\u4e86\u4e00\u4e2a\u547d\u540d\u4e3aitem\u7684\u5b57\u5178\uff0c\u6bcf\u4e2a\u5b9a\u4e49\u7684\u9879\u76ee\u4f5c\u4e3a\u952e\u503c\u5bf9\u5b58\u50a8\u5728item\u5b57\u5178\u4e2d\u2014\u2014\u952e\u503c\u5bf9\u5b58\u50a8\u7684\u5185\u5bb9\u53ef\u4ee5\u662f\u4efb\u4f55Python\u5bf9\u8c61[\u5e38\u7528\u7684\u5b57\u7b26\u4e32\u3001\u5217\u8868\u7b49]\uff09\uff0c\u4fee\u6539items.py\u6587\u4ef6\u5185\u5bb9\u5982\u4e0b\uff1a<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>class CqLandItem(scrapy.Item):\n    # define the fields for your item here like:\n    # name = scrapy.Field()\n    title = scrapy.Field() # \u51fa\u8ba9\u5730\u6807\u9898\n    list_time = scrapy.Field() # \u63a8\u51fa\u65f6\u95f4\n<\/code><\/pre>\n\n\n\n<h6 class=\"wp-block-heading\">4\u3001\u521b\u5efa\u722c\u866b\u6587\u4ef6\uff0c\u7528\u4e8e\u722c\u53d6\u7f51\u9875\u5185\u5bb9<\/h6>\n\n\n\n<p>\u8fd9\u65f6\u5019\u8fd4\u56de\u5230cmd\u6216powershell\u7ec8\u7aef\uff0ccd\u8fdb\u5165cq_land\u76ee\u5f55\uff0c\u7136\u540e\u521b\u5efa\u540d\u4e3atudinet\u7684\u722c\u866b\u6587\u4ef6\uff0c\u7528\u4e8e\u722c\u53d6\u571f\u6d41\u7f51\u7684\u6570\u636e\u3002<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>PS E:\\web_data&gt; cd cq_land\nPS E:\\web_data\\cq_land&gt; scrapy genspider tudinet tudinet.com\nCreated spider 'tudinet' using template 'basic' in module:\n  cq_land.spiders.tudinet\nPS 
E:\\web_data\\cq_land&gt;\n<\/code><\/pre>\n\n\n\n<p>\u5b8c\u6210\u540e\uff0c\u5c06\u5728spider\u6587\u4ef6\u5939\u4e0b\u4ea7\u751f\u4e00\u4e2a\u53ebtudinet.py\u7684\u6587\u4ef6\u3002\u8fd9\u4e2a\u6587\u4ef6\u5c31\u662f\u5b9a\u4e49\u722c\u866b\u600e\u4e48\u89e3\u6790\u7f51\u9875\u7684\u6587\u4ef6\uff0c\u89e3\u6790\u7684\u5185\u5bb9\u600e\u4e48\u5b58\u50a8\u5230\u521a\u624d\u5b9a\u4e49\u597d\u7684item\u4e2d\u3002<\/p>\n\n\n\n<h6 class=\"wp-block-heading\">5\u3001\u7f16\u5199tudinet.py\u6587\u4ef6\uff0c\u5b8c\u6210\u7f51\u9875\u5185\u5bb9\u89e3\u6790<\/h6>\n\n\n\n<p>\u9996\u5148\u9700\u8981\u5c06\u521a\u624d\u5b9a\u4e49\u597d\u7684item\u5185\u5bb9import\u8fdb\u6765\uff0c\u7136\u540e\u4fee\u6539start_urls\u4e3a\u6211\u4eec\u8981\u722c\u53d6\u7684\u7f51\u9875\uff08\u8fd9\u91cc\u6211\u4eec\u53ea\u722c\u53d6\u4e00\u4e2a\u7f51\u9875\u4f5c\u4e3a\u793a\u4f8b\uff09\u3002\u7136\u540e\u5728parse\u51fd\u6570\u4e2d\u5b9a\u4e49\u5904\u7406\u8fc7\u7a0b\uff0c\u5e76\u8fd4\u56de\u7ed3\u679c\uff0c\u4ee3\u7801\u5982\u4e0b\uff1a<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import scrapy\nfrom cq_land.items import CqLandItem\n\nclass TudinetSpider(scrapy.Spider):\n    name = 'tudinet'\n    allowed_domains = &#91;'tudinet.com']\n    start_urls = &#91;'https:\/\/www.tudinet.com\/market-252-0-0-0']\n\n    def parse(self, response):\n        item=CqLandItem()\n        item&#91;'title']=response.xpath(\"\/\/div&#91;@class='land-l-bt']\/text()\").extract()\n        item&#91;'list_time']=response.xpath(\"\/\/div&#91;@class='land-l-cont']\/dl\/dd\/p&#91;1]\/text()\").extract()\n        \n        return 
item\n<\/code><\/pre>\n\n\n\n<p>\u4ee5\u4e0a\u662f\u9488\u5bf9\u5355\u4e2a\u7f51\u9875\uff0c\u4f46\u5b9e\u9645\u4e0a\u6211\u4eec\u722c\u866b\u591a\u534a\u662f\u9700\u8981\u9488\u5bf9\u6574\u4e2a\u7f51\u7ad9\u7684\u6240\u6709\u571f\u5730\u8f6c\u8ba9\u4fe1\u606f\u8fdb\u884c\u722c\u53d6\u7684\uff0c\u56e0\u6b64\uff0c\u6211\u4eec\u6839\u636e\u7f51\u7ad9\u7ffb\u9875\u7684\u53d8\u5316\uff0c\u4e00\u5171\u6709100\u9875\u53ef\u4ee5\u4f9b\u6211\u4eec\u722c\u53d6\u3002\u56e0\u6b64\uff0c\u6211\u4eec\u53ef\u4ee5\u5728\u5f00\u5934\u5bf9start_urls\u8fdb\u884c\u91cd\u65b0\u5b9a\u4e49\uff0c\u7528\u4ee5\u722c\u53d6\u8fd9100\u4e2a\u9875\u9762\u3002<\/p>\n\n\n\n<p>\u8fd9\u91cc\u67092\u79cd\u5904\u7406\u65b9\u5f0f\uff0c\u7b2c\u4e00\u79cd\u5904\u7406\u65b9\u5f0f\uff0c\u5c31\u662f\u76f4\u63a5\u628a\u8fd9100\u4e2aurl\u4f5c\u4e3a\u5217\u8868\u653e\u5230start_urls \u4e2d\u3002\u5176\u4ed6\u7684\u5c31\u4e0d\u7528\u6539\u52a8\u4e86\u3002<\/p>\n\n\n\n<p>\u7b2c\u4e8c\u79cd\u5904\u7406\u65b9\u5f0f\uff0c\u5c31\u662f\u91cd\u65b0\u5b9a\u4e49start_requests\u51fd\u6570\u3002\u5b9e\u9645\u4e0a\u5b9a\u4e49\u8fd9\u4e2a\u51fd\u6570\u5c31\u662f\u628astart_urls\u5217\u8868\u91cc\u9762\u7684\u5730\u5740\u7528\u51fd\u6570\u751f\u6210\uff0c\u7136\u540e\u518d\u901a\u8fc7callback\u53c2\u6570\u8bbe\u7f6e\u56de\u8c03\u51fd\u6570\uff0c\u8ba9parse\u51fd\u6570\u6765\u5904\u7406Request\u4ea7\u751f\u7684\u7ed3\u679c\u3002<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import scrapy\nfrom cq_land.items import CqLandItem\nfrom scrapy.http import Request\n\nclass TudinetSpider(scrapy.Spider):\n    name = 'tudinet'\n    allowed_domains = &#91;'tudinet.com']\n    # start_urls = &#91;'https:\/\/www.tudinet.com\/market-252-0-0-0']\n    \n    def start_requests(self):\n        init_url='https:\/\/www.tudinet.com\/market-252-0-0-0\/list-pg'\n        for i in range(1,101):\n            yield Request(\"\".join(&#91;init_url,str(i),'.html']),callback=self.parse)\n\n    def parse(self, response):\n        
item=CqLandItem()\n        item&#91;'title']=response.xpath(\"\/\/div&#91;@class='land-l-bt']\/text()\").extract()\n        item&#91;'list_time']=response.xpath(\"\/\/div&#91;@class='land-l-cont']\/dl\/dd\/p&#91;1]\/text()\").extract()\n        \n        return item\n<\/code><\/pre>\n\n\n\n<p>\u5b9e\u9645\u4e0a\uff0c\u5bf9\u4e8e\u6709\u591a\u4e2a\u5c42\u6b21\u7684\u7f51\u9875\uff0c\u4f8b\u5982\u67d0\u4e9b\u8bba\u575b\uff0c\u6709\u5f88\u591a\u6587\u7ae0\u5217\u8868\u3002\u6211\u4eec\u9996\u5148\u9700\u8981\u8bbf\u95ee\u9996\u9875\u83b7\u53d6\u9875\u9762\u603b\u6570\uff0c\u7136\u540e\u904d\u5386\u6240\u6709\u9875\u9762\u83b7\u53d6\u6bcf\u4e2a\u5e16\u5b50\u7684url\uff0c\u6700\u540e\u901a\u8fc7\u8bbf\u95ee\u6bcf\u4e2a\u5e16\u5b50\u7684\u5730\u5740\u83b7\u53d6\u6587\u7ae0\u7684\u8be6\u7ec6\u4fe1\u606f\u3002\u90a3\u4e48\u6211\u4eec\u7684\u722c\u866b\u6587\u4ef6\u7ed3\u6784\u57fa\u672c\u5982\u4e0b\uff1a<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>class abc_Spider(scrapy.Spider):\n    name='abc'\n    allowed_domains=&#91;'abc.com']\n    start_urls=&#91;'\u8bba\u575b\u7684\u9996\u9875']\n    # \u83b7\u53d6\u603b\u9875\u6570\n    def parse(self,response):\n        pages=response.xpath(\"\/\/xxxx\/\/\").extract()\n        # \u8fd9\u91cc\u7701\u7565\u4e86\u5c06pages\u7531\u5b57\u7b26\u8f6c\u6570\u5b57\u7684\u8fc7\u7a0b\n        for i in range(1,int(pages)):\n            yield Request(\"\".join(&#91;'xx.abc.com',str(i),\"xxx\"]),callback=self.get_detail_urls)\n    # \u83b7\u53d6\u6240\u6709\u6587\u7ae0\u8be6\u60c5\u9875\u7684\u9875\u9762url\n    def get_detail_urls(self,response):\n        detail_urls=response.xpath(\"\/\/xxxx\/\/\").extract()\n        for url in detail_urls:\n            yield Request(url,callback=self.parse_content)\n    # \u5bf9\u8be6\u60c5\u9875\u5185\u5bb9\u8fdb\u884c\u89e3\u6790\uff0c\u5e76\u8fd4\u56de\u7ed3\u679c\n    def parse_content(self,response):\n        item=abcitem()\n        
item&#91;'xx']=response.xpath(\"\/\/xxxx\/\/\").extract()\n        return item\n<\/code><\/pre>\n\n\n\n<h6 class=\"wp-block-heading\">6\u3001\u4fee\u6539pipelines.py\uff0c\u5bf9\u83b7\u53d6\u5230\u7684\u4fe1\u606f\u8fdb\u884c\u6574\u7406\uff0c\u5b8c\u6210\u5b58\u50a8<\/h6>\n\n\n\n<p>\u5c06pipelines.py\u4fee\u6539\u4e3a\u5982\u4e0b\u5185\u5bb9\uff0c\u722c\u53d6\u7684\u5185\u5bb9\u5c06\u4f1a\u5b58\u50a8\u5230cq_land.csv\u6587\u4ef6\u4e2d\u3002<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import pandas as pd\n\nclass CqLandPipeline(object):        \n    def process_item(self, item, spider):\n        title=item&#91;'title']\n        list_time=item&#91;'list_time']\n        data=pd.DataFrame(&#91;title,list_time],index=&#91;'\u6807\u9898','\u63a8\u51fa\u65f6\u95f4']).T\n        data.to_csv('cq_land.csv',index=False,encoding='gb2312')\n        return item\n<\/code><\/pre>\n\n\n\n<p>\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0c\u4e0a\u8ff0\u5b58\u50a8\u65b9\u5f0f\u9002\u7528\u4e8e\u5355\u4e2a\u7f51\u9875\u7684\u722c\u53d6\u3002\u5982\u679c\u662f\u591a\u4e2a\u7f51\u9875\uff0c\u9700\u8981\u5728data.to_csv\u4e2d\u6dfb\u52a0\u53c2\u6570\uff0cmode=&#8217;a&#8217;\uff0c\u8868\u793a\u4ee5\u8ffd\u52a0\u7684\u65b9\u5f0f\u6dfb\u52a0\u6570\u636e\uff0c\u540c\u65f6\u5e94\u6ce8\u610f\u6570\u636e\u7684\u5217\u6807\u9898\u95ee\u9898\u3002\u53e6\u5916\uff0c\u4e5f\u53ef\u4ee5\u4f7f\u7528\u6570\u636e\u5e93\u7b49\u65b9\u5f0f\u5728\u8fd9\u91cc\u5c06\u6570\u636e\u76f4\u63a5\u5b58\u50a8\u5230\u6570\u636e\u5e93\u3002<\/p>\n\n\n\n<h6 class=\"wp-block-heading\">7\u3001\u4f7f\u7528scrapy crawl tudinet \u8fd0\u884c\u722c\u866b<\/h6>\n\n\n\n<p>\u5b8c\u6210\u4e0a\u8ff0\u6587\u4ef6\u7f16\u8f91\u540e\uff0c\u8fd4\u56decmd\u6216powershell\u7ec8\u7aef\uff0c\u8fd0\u884c\uff1ascrapy crawl tudinet<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>PS E:\\web_data\\cq_land&gt; scrapy crawl tudinet\n2019-04-15 19:35:34 &#91;scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: cq_land)\n2019-04-15 
19:35:34 &#91;scrapy.utils.log] INFO: Versions:\n\u2026\u2026\n\u2026\u2026\n\n<\/code><\/pre>\n\n\n\n<p>\u6ca1\u6709\u610f\u5916\u7684\u8bdd\uff0c\u4e0a\u8ff0\u4ee3\u7801\u4f1a\u4ea7\u751f\u4e00\u4e2acq_land.csv\u6587\u4ef6\uff0c\u6253\u5f00\u6587\u4ef6\u5982\u679c\u5185\u5bb9\u6b63\u786e\u5c31\u8bf4\u660e\u722c\u866b\u7f16\u5199\u6210\u529f\u4e86\u3002<br>\u5f53\u7136\uff0cscrapy crawl tudinet\u8fd8\u53ef\u4ee5\u5e26\u53c2\u6570\uff0c\u7528\u4e8e\u663e\u793a\u65e5\u5fd7\u7684\u7ea7\u522b\uff1a<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>CRITICAL - \u4e25\u91cd\u9519\u8bef(critical)\nERROR - \u4e00\u822c\u9519\u8bef(regular errors)\nWARNING - \u8b66\u544a\u4fe1\u606f(warning messages)\nINFO - \u4e00\u822c\u4fe1\u606f(informational messages)\nDEBUG - \u8c03\u8bd5\u4fe1\u606f(debugging messages)\n\n\u53ef\u4ee5\u4f7f\u7528\u4ee5\u4e0b\u65b9\u6cd5\u6309\u9700\u8981\u663e\u793a\u65e5\u5fd7\n# \u5b8c\u5168\u4e0d\u8f93\u51fa\u65e5\u5fd7\nscrapy crawl tudinet --nolog\n# \u6309\u9ed8\u8ba4\u8f93\u51fa\u65e5\u5fd7\nscrapy crawl tudinet -L DEBUG\n<\/code><\/pre>\n\n\n\n<p>\u4ee5\u4e0a\u5c31\u662fscrapy\u722c\u866b\u7684\u57fa\u672c\u5f62\u5f0f\u3002<br>\u901a\u5e38\u60c5\u51b5\u4e0b\uff0c\u5bf9\u4e8e\u4e0d\u719f\u6089scrapy\u7684\u60c5\u5f62\u4e0b\uff0c\u53ef\u80fd\u5bf9scrapy\u4ea7\u751f\u7684\u5185\u5bb9\u4e0d\u4e86\u89e3\uff0c\u4e0d\u77e5\u9053\u54ea\u91cc\u51fa\u4e86\u95ee\u9898\uff0c\u8fd9\u91cc\u53ef\u80fd\u9700\u8981\u7528\u5230scrapy\u7684\u53e6\u4e00\u547d\u4ee4\uff0cscrapy shell url\u3002\u8fd9\u4e2a\u547d\u4ee4\u4f1a\u76f4\u63a5\u722c\u53d6url\u7684\u5185\u5bb9\uff0c\u7136\u540e\u5728\u7ec8\u7aef\u4e2d\uff0c\u901a\u8fc7ipython\u7ec8\u7aef\u7684\u65b9\u5f0f\u4ea7\u751f\u4ea4\u4e92\uff0c\u7528\u6237\u53ef\u4ee5\u4f7f\u7528response.xpath() \u7b49\u65b9\u6cd5\u6d4b\u8bd5\u8fd4\u56de\u7ed3\u679c\u3002\u5982response.url\u5c31\u662f\u8bf7\u6c42\u7684url\u3002<em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>PS E:\\web_data\\cq_land&gt; scrapy shell 
https:\/\/www.tudinet.com\/market-252-0-0-0\n2019-04-15 20:21:03 &#91;scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: cq_land)\n2019-04-15 20:21:03 &#91;scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.9.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) &#91;MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o  27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.17763-SP0\n\u2026\u2026\n\u2026\u2026\n&#91;s] Available Scrapy objects:\n&#91;s]   scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)\n&#91;s]   crawler    &lt;scrapy.crawler.Crawler object at 0x000001DAF8807358&gt;\n&#91;s]   item       {}\n&#91;s]   request    &lt;GET https:\/\/www.tudinet.com\/market-252-0-0-0&gt;\n&#91;s]   response   &lt;200 https:\/\/www.tudinet.com\/market-252-0-0-0&gt;\n&#91;s]   settings   &lt;scrapy.settings.Settings object at 0x000001DAF9B70898&gt;\n&#91;s]   spider     &lt;TudinetSpider 'tudinet' at 0x1daf9532f28&gt;\n&#91;s] Useful shortcuts:\n&#91;s]   fetch(url&#91;, redirect=True]) Fetch URL and update local objects (by default, redirects are followed)\n&#91;s]   fetch(req)                  Fetch a scrapy.Request and update local objects\n&#91;s]   shelp()           Shell help (print this help)\n&#91;s]   view(response)    View response in a browser\nIn &#91;1]:\n<\/code><\/pre>\n\n\n\n<h6 class=\"wp-block-heading\">\u9644\uff1ascrapy\u7684\u547d\u4ee4\u5217\u8868<\/h6>\n\n\n\n<p><em><\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>\u5168\u5c40\u7684\u547d\u4ee4\u6709\uff1a\n\nstartproject \uff1a\u521b\u5efa\u9879\u76ee\uff08\u5e38\u7528\u5fc5\u987b\uff09\ngenspider \uff1a\u521b\u5efa\u722c\u866b\uff08\u5e38\u7528\u5fc5\u987b\uff09\nsettings \uff1a\u83b7\u53d6\u5f53\u524d\u7684\u914d\u7f6e\u4fe1\u606f\uff0c\u901a\u8fc7scrapy settings -h\u53ef\u4ee5\u83b7\u53d6\u8fd9\u4e2a\u547d\u4ee4\u7684\u6240\u6709\u5e2e\u52a9\u4fe1\u606f\nrunspider 
\uff1a\u672a\u521b\u5efa\u9879\u76ee\u7684\u60c5\u51b5\u4e0b\uff0c\u8fd0\u884c\u4e00\u4e2a\u7f16\u5199\u5728Python\u6587\u4ef6\u4e2d\u7684spider\nshell \uff1a \u5728\u7ec8\u7aef\u7a97\u53e3\u8bf7\u6c42\u4e00\u4e2a\u7f51\u5740\uff0c\u53ef\u7528\u4e8e\u63a2\u7d22\u722c\u53d6\u83b7\u5f97\u7684\u5185\u5bb9\uff08\u5e38\u7528\uff09\nfetch \uff1a\u901a\u8fc7scrapy downloader \u5c06\u7f51\u9875\u7684\u6e90\u4ee3\u7801\u4e0b\u8f7d\u4e0b\u6765\u5e76\u663e\u793a\u51fa\u6765\nview \uff1a\u5c06\u7f51\u9875document\u5185\u5bb9\u4e0b\u8f7d\u4e0b\u6765\uff0c\u5e76\u4e14\u5728\u6d4f\u89c8\u5668\u663e\u793a\u51fa\u6765\nversion \uff1a\u67e5\u770b\u7248\u672c\u4fe1\u606f\uff0c\u5e76\u67e5\u770b\u4f9d\u8d56\u5e93\u7684\u4fe1\u606f\n\n\u9879\u76ee\u547d\u4ee4\u6709\uff1a\n\ncrawl \uff1a\u8fd0\u884c\u722c\u866b\uff08\u5e38\u7528\u5fc5\u987b\uff09\ncheck \uff1a \u68c0\u67e5\u4ee3\u7801\u662f\u5426\u6709\u9519\u8bef\nlist \uff1a\u5217\u51fa\u6240\u6709\u53ef\u7528\u722c\u866b\nedit \uff1aedit \u5728\u547d\u4ee4\u884c\u4e0b\u7f16\u8f91spider ### \u4e0d\u5efa\u8bae\u8fd0\u884c\nparse \nbench<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u672c\u6587\u4e3b\u8981\u8bb0\u5f55Scrapy\u7684\u5e38\u7528\u547d\u4ee4\uff0c\u7528\u4e8e\u5907\u5fd8\u3002\u9002\u7528\u4e8eWindows\u5e73\u53f0\u3002\u4f8b\u5982\uff0c\u6211\u4eec\u8981\u722c\u53d6\u8fd9\u4e2a\u7f51\u7ad9\uff1ahttps:<\/p><\/div>\n<div class=\"blog-btn\"><a href=\"https:\/\/shanlin.info\/?p=3132\" 
class=\"home-blog-btn\">\u9605\u8bfb\u66f4\u591a<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[3],"tags":[],"_links":{"self":[{"href":"https:\/\/shanlin.info\/index.php?rest_route=\/wp\/v2\/posts\/3132"}],"collection":[{"href":"https:\/\/shanlin.info\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/shanlin.info\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/shanlin.info\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/shanlin.info\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=3132"}],"version-history":[{"count":1,"href":"https:\/\/shanlin.info\/index.php?rest_route=\/wp\/v2\/posts\/3132\/revisions"}],"predecessor-version":[{"id":3134,"href":"https:\/\/shanlin.info\/index.php?rest_route=\/wp\/v2\/posts\/3132\/revisions\/3134"}],"wp:attachment":[{"href":"https:\/\/shanlin.info\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=3132"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/shanlin.info\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=3132"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/shanlin.info\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=3132"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}