Request and Response in Scrapy

Request

Partial source code of Request:

    # Partial source code
    class Request(object_ref):

        def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                     cookies=None, meta=None, encoding='utf-8', priority=0,
                     dont_filter=False, errback=None):

            self._encoding = encoding  # this one has to be set first
            self.method = str(method).upper()
            self._set_url(url)
            self._set_body(body)
            assert isinstance(priority, int), "Request priority not an integer: %r" % priority
            self.priority = priority

            assert callback or not errback, "Cannot use errback without a callback"
            self.callback = callback
            self.errback = errback

            self.cookies = cookies or {}
            self.headers = Headers(headers or {}, encoding=encoding)
            self.dont_filter = dont_filter

            self._meta = dict(meta) if meta else None

        @property
        def meta(self):
            if self._meta is None:
                self._meta = {}
            return self._meta

The most commonly used parameters:

url: the URL to be requested and processed in the next step.
callback: specifies the function that will handle the Response returned by this request.
method: usually does not need to be set; the default is GET. It can be "GET", "POST", "PUT", etc., and must be an uppercase string.
headers: the headers sent with the request. Usually not needed. Typical contents look like:

    Host: media.readthedocs.org
    User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
    Accept: text/css,*/*;q=0.1
    Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
    Accept-Encoding: gzip, deflate
    Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
    Cookie: _ga=GA1.2.1612165614.1415584110;
    Connection: keep-alive
    If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
    Cache-Control: max-age=0


meta: commonly used; a dict used to pass data between different requests (see also the spider sketch after this list). For example, combined with the cookies parameter:

    request_with_cookies = Request(
        url="http://www.example.com",
        cookies={'currency': 'USD', 'country': 'UY'},
        meta={'dont_merge_cookies': True}
    )

encoding: the default 'utf-8' is fine.
dont_filter: indicates that this request should not be filtered out by the scheduler. Use it when you want to send the same request more than once and ignore the duplicates filter. Defaults to False.
errback: specifies a function to handle errors.
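
Below is a minimal sketch of a spider that uses meta to carry data from one request into the callback of the next; the spider name, URLs, and the from_page key are hypothetical and only for illustration:

    import scrapy

    class MetaDemoSpider(scrapy.Spider):
        name = "meta_demo"  # hypothetical spider name
        start_urls = ["http://www.example.com/list"]

        def parse(self, response):
            # store the listing page's URL in meta so the next callback can read it
            yield scrapy.Request(
                url="http://www.example.com/detail",
                meta={"from_page": response.url},
                callback=self.parse_detail,
            )

        def parse_detail(self, response):
            # read back the value the previous request stored in meta
            self.logger.info("arrived from %s", response.meta["from_page"])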
Response

Partial source code of Response:

    # Partial source code
    class Response(object_ref):

        def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
            self.headers = Headers(headers or {})
            self.status = int(status)
            self._set_body(body)
            self._set_url(url)
            self.request = request
            self.flags = [] if flags is None else list(flags)

        @property
        def meta(self):
            try:
                return self.request.meta
            except AttributeError:
                raise AttributeError("Response.meta not available, this response "
                                     "is not tied to any request")

Most parameters are similar to those of Request above:

status: the HTTP status code of the response.
_set_body(body): sets the response body.
_set_url(url): sets the response URL.
request: the Request object that produced this response; the meta property above simply proxies to request.meta. A small callback sketch using these attributes follows below.
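
As a small sketch of how these attributes are typically read inside a callback (the method name parse_page is arbitrary):

    def parse_page(self, response):
        # status and url are set by the Response constructor above
        if response.status != 200:
            self.logger.warning("Got %d for %s", response.status, response.url)
            return
        # Response.meta proxies to the meta dict of the originating Request
        self.logger.info("meta carried over: %r", response.meta)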

Sending POST requests

To send a POST request, use:

    yield scrapy.FormRequest(url, formdata, callback)

If the very first request should already be a POST, override start_requests(self):
    import scrapy

    class mySpider(scrapy.Spider):
        name = "myspider"  # hypothetical name; every spider needs one
        # start_urls = ["http://www.example.com/"]

        def start_requests(self):
            url = 'http://www.renren.com/PLogin.do'

            # FormRequest is Scrapy's way of sending a POST request
            yield scrapy.FormRequest(
                url=url,
                formdata={"email": "loaderman@163.com", "password": "loaderman"},
                callback=self.parse_page
            )

        def parse_page(self, response):
            # do something
            pass

Simulating login

Use the FormRequest.from_response() method to simulate a user login.

Websites usually pre-populate certain form fields, such as session data or authentication tokens on a login page, through <input type="hidden"> elements.

When scraping with Scrapy, if you want to pre-populate or override form fields such as the username and password, you can use FormRequest.from_response().

Below is an example spider using this approach:

    import scrapy

    class LoginSpider(scrapy.Spider):
        name = 'example.com'
        start_urls = ['http://www.example.com/users/login.php']

        def parse(self, response):
            return scrapy.FormRequest.from_response(
                response,
                formdata={'username': 'john', 'password': 'secret'},
                callback=self.after_login
            )

        def after_login(self, response):
            # check that the login succeeded before going on
            if b"authentication failed" in response.body:
                self.logger.error("Login failed")
                return
            # continue scraping with the authenticated session...

Zhihu spider example for reference:

zhihuSpider.py spider code:

    # -*- coding: utf-8 -*-
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.selector import Selector
    from scrapy.linkextractors import LinkExtractor
    from scrapy import Request, FormRequest
    from zhihu.items import ZhihuItem


    class ZhihuSipder(CrawlSpider):
        name = "zhihu"
        allowed_domains = ["www.zhihu.com"]
        start_urls = [
            "http://www.zhihu.com"
        ]
        rules = (
            Rule(LinkExtractor(allow=(r'/question/\d+#.*?', )), callback='parse_page', follow=True),
            Rule(LinkExtractor(allow=(r'/question/\d+', )), callback='parse_page', follow=True),
        )

        headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
            "Referer": "http://www.zhihu.com/"
        }

        # Override the spider's start_requests() to send a custom login request;
        # on success the callback is invoked
        def start_requests(self):
            return [Request("https://www.zhihu.com/login",
                            meta={'cookiejar': 1},
                            callback=self.post_login)]

        def post_login(self, response):
            print('Preparing login')
            # Grab the _xsrf token from the returned login page;
            # it is required for the form submission to succeed
            xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
            print(xsrf)
            # FormRequest.from_response is a Scrapy helper for posting forms;
            # after a successful login, the after_login callback is invoked
            return [FormRequest.from_response(response,
                                              meta={'cookiejar': response.meta['cookiejar']},
                                              headers=self.headers,  # note the custom headers here
                                              formdata={
                                                  '_xsrf': xsrf,
                                                  'email': '1095511864@qq.com',
                                                  'password': ''
                                              },
                                              callback=self.after_login,
                                              dont_filter=True)]

        def after_login(self, response):
            for url in self.start_urls:
                yield self.make_requests_from_url(url)

        def parse_page(self, response):
            problem = Selector(response)
            item = ZhihuItem()
            item['url'] = response.url
            item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
            print(item['name'])
            item['title'] = problem.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
            item['description'] = problem.xpath('//div[@class="zm-editable-content"]/text()').extract()
            item['answer'] = problem.xpath('//div[@class=" zm-editable-content clearfix"]/text()').extract()
            return item

Item class definition

    from scrapy.item import Item, Field

    class ZhihuItem(Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        url = Field()          # URL of the scraped question
        title = Field()        # title of the question
        description = Field()  # description of the question
        answer = Field()       # answer to the question
        name = Field()         # name of the user

settings.py: set the crawl delay

    BOT_NAME = 'zhihu'

    SPIDER_MODULES = ['zhihu.spiders']
    NEWSPIDER_MODULE = 'zhihu.spiders'

    DOWNLOAD_DELAY = 0.25  # set the download delay to 250 ms
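
Note that Scrapy's built-in RANDOMIZE_DOWNLOAD_DELAY setting is enabled by default, so the actual wait is a random value between 0.5 × and 1.5 × DOWNLOAD_DELAY. If you need a fixed 250 ms interval, disable it:

    DOWNLOAD_DELAY = 0.25
    RANDOMIZE_DOWNLOAD_DELAY = False  # wait exactly DOWNLOAD_DELAY between requests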