首页 - 互联网

Scrapy爬虫大战京东商城

作者: 五速梦信息网
时间: 2026年06月03日 13:31

def parse_url(self, response):
    """Parse a JD search-result page and schedule the AJAX follow-up request.

    Yields one ``JdSpiderItem`` for every product ``<li>`` found under
    ``div#J_goodsList``, then yields a ``scrapy.Request`` built from
    ``self.search_url``, ``response.meta['search_page']`` and the collected
    product ids, with ``self.next_half_parse`` as its callback.

    Nothing is yielded when ``response.status`` is not 200.
    """
    if response.status == 200:  # only handle successful responses
        # print(response.url)
        # Product ids seen on this page. A set filters out duplicates; the ids
        # are joined into the URL of the follow-up AJAX request below.
        pids = set()
        try:
            # Grab every product <li> first, then pull the fields out of each.
            all_goods = response.xpath("//div[@id='J_goodsList']/ul/li")

            for goods in all_goods:
                # scrapy.shell.inspect_response(response, self)  # debugging aid: drops into an interactive shell
                items = JdSpiderItem()  # container for the scraped fields

                # extract() returns [] when nothing matches, so never index
                # with [0] directly on these results.
                img_url_src = goods.xpath("div/div[1]/a/img/@src").extract()
                # Lazily-loaded images expose their URL here instead of @src.
                img_url_delay = goods.xpath(
                    "div/div[1]/a/img/@data-lazy-img").extract()
                price = goods.xpath("div/div[3]/strong/i/text()").extract()  # price
                cloths_name = goods.xpath("div/div[4]/a/em/text()").extract()
                shop_id = goods.xpath("div/div[7]/@ data-shopid").extract()
                cloths_url = goods.xpath("div/div[1]/a/@href").extract()
                person_number = goods.xpath("div/div[5]/strong/a/text()").extract()
                pid = goods.xpath("@data-pid").extract()
                # product_id = goods.xpath("@data-sku").extract()

                if pid:
                    pids.add(pid[0])
                if img_url_src:  # image already loaded
                    print(img_url_src[0])
                    items['img_url'] = img_url_src[0]
                if img_url_delay:  # image not loaded yet: use the lazy URL
                    print(img_url_delay[0])
                    items['img_url'] = img_url_delay[0]
                if price:
                    items['price'] = price[0]
                if cloths_name:
                    items['cloths_name'] = cloths_name[0]
                if shop_id:
                    items['shop_id'] = shop_id[0]
                    shop_url = "https://mall.jd.com/index-" + str(shop_id[0]) + ".html"
                    items['shop_url'] = shop_url
                if cloths_url:
                    items['cloths_url'] = cloths_url[0]
                if person_number:
                    items['person_number'] = person_number[0]
                # if product_id:
                #     print("************************************csdjkvjfskvnk***********************")
                #     print(self.comments_url.format(str(product_id[0]), str(self.count)))
                #     yield scrapy.Request(url=self.comments_url.format(str(product_id[0]), str(self.count)), callback=self.comments)
                # A scrapy.Request yielded here would invoke its callback once
                # for every product parsed.
                yield items
        except Exception:
            # Best-effort parsing: report the failure and carry on, but show
            # the traceback so the cause is not lost.
            import traceback
            print("********************************************ERROR**********************************************************************")
            traceback.print_exc()

        # Request the AJAX-loaded data. This must come after the loop because
        # the URL can only be built once every pid has been collected; the
        # callback parses that response.
        yield scrapy.Request(
            url=self.search_url.format(
                str(response.meta['search_page']), ",".join(pids)),
            callback=self.next_half_parse)