博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Python学习笔记——爬虫之Scrapy项目实战
阅读量:4218 次
发布时间:2019-05-26

本文共 19956 字,大约阅读时间需要 66 分钟。

目录


手机App抓包爬虫

1. items.py

import scrapy


class DouyuspiderItem(scrapy.Item):
    """Item describing one picture scraped from the Douyu mobile API."""

    # Name under which the picture is stored.
    name = scrapy.Field()
    # URL of the picture.
    imagesUrls = scrapy.Field()
    # Local path where the picture has been saved.
    imagesPath = scrapy.Field()

2. spiders/douyu.py

import json

import scrapy

from douyuSpider.items import DouyuspiderItem


class DouyuSpider(scrapy.Spider):
    """Walk the Douyu "vertical room" JSON API page by page.

    Every room in a response yields one ``DouyuspiderItem`` carrying the
    streamer nickname and the picture URL. Paging stops as soon as the API
    returns an empty ``data`` list.
    """

    name = "douyu"
    # The attribute must be spelled ``allowed_domains`` and contain bare
    # domain names (no scheme), otherwise it has no effect.
    allowed_domains = ["capi.douyucdn.cn"]
    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield one item per room, then request the next page.

        :param response: JSON response whose ``data`` key holds the room list.
        """
        data = json.loads(response.text)["data"]
        if not data:
            # An empty page means every room has been seen; without this
            # check the spider would keep requesting pages forever.
            return

        for each in data:
            item = DouyuspiderItem()
            item["name"] = each["nickname"]
            item["imagesUrls"] = each["vertical_src"]
            yield item

        # The URL asks for 20 rooms per call, so advance by the same amount.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

3. 设置settings.py

ITEM_PIPELINES = {'douyuSpider.pipelines.ImagesPipeline': 1}

# Directory where downloaded images are stored; read back in pipelines.py.
IMAGES_STORE = "/Users/Power/lesson_python/douyuSpider/Images"

# User-Agent header value.
USER_AGENT = 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)'

4. pipelines.py

import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


class ImagesPipeline(ImagesPipeline):
    """Download each item's picture and rename it after the item's name.

    The class keeps the name ``ImagesPipeline`` because that is the dotted
    path registered in ``ITEM_PIPELINES``; it subclasses Scrapy's pipeline of
    the same name imported above.
    """

    # Root directory of the downloaded images, taken from the project settings.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield one download request for the item's picture URL.

        The outcome of these requests is what ``item_completed`` receives as
        ``results``: a sequence of ``(success, image_info_or_failure)`` pairs.
        When ``success`` is true, ``image_info_or_failure`` is a dict with the
        keys ``url``, ``path`` and ``checksum``.
        """
        yield scrapy.Request(item["imagesUrls"])

    def item_completed(self, results, item, info):
        """Rename the downloaded file to ``<name>.jpg`` and record its path.

        :raises DropItem: when no picture was downloaded successfully.
        :returns: the item with ``imagesPath`` pointing at the renamed file.
        """
        # Keep only the storage-relative paths of successful downloads.
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            # Indexing an empty list used to raise IndexError here.
            raise DropItem("image download failed for %r" % item["imagesUrls"])

        source = os.path.join(self.IMAGES_STORE, image_paths[0])
        target = os.path.join(self.IMAGES_STORE, item["name"] + ".jpg")
        os.rename(source, target)
        # Store the real file name, including the ".jpg" extension.
        item["imagesPath"] = target
        return item

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline

# Equivalent to running "scrapy crawl douyu" in a shell; handy for debugging
# the spider from an IDE.
cmdline.execute('scrapy crawl douyu'.split())

执行程序

py2 main.py

 

阳光热线问政平台

爬取投诉帖子的编号、帖子的url、帖子的标题,和帖子里的内容。

items.py

import scrapy


class DongguanItem(scrapy.Item):
    """One complaint post from the Sunshine Hotline site."""

    # Title of the post.
    title = scrapy.Field()
    # Reference number of the post.
    number = scrapy.Field()
    # Text content of the post.
    content = scrapy.Field()
    # URL of the post.
    url = scrapy.Field()

spiders/sunwz.py

Spider 版本

# -*- coding: utf-8 -*-
import scrapy

from dongguan.items import DongguanItem


class SunSpider(scrapy.Spider):
    """Plain-Spider version: page through the listing and scrape every post.

    The base class is ``scrapy.Spider``; this module never imports
    ``CrawlSpider`` and overrides ``parse`` itself.
    """

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Request every post linked from a listing page, then the next page."""
        links = response.xpath(
            "//div[@class='greyframe']/table//td/a[@class='news14']/@href"
        ).extract()
        for link in links:
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_item)

        # Stop condition for paging; each listing page advances the offset by 30.
        if self.offset <= 71130:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_item(self, response):
        """Build a ``DongguanItem`` from a single post page."""
        titles = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()
        if not titles:
            # Without a title neither the title nor the number can be filled.
            self.logger.warning("no title found on %s", response.url)
            return

        item = DongguanItem()
        item['title'] = titles[0]
        # The number is whatever follows the last ":" in the last
        # space-separated token of the title.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # Posts with pictures keep their text in "contentext"; fall back to
        # the layout used by posts without pictures.
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if not content:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = "".join(content).strip()

        item['url'] = response.url
        yield item

CrawlSpider 版本

# -*- coding: utf-8 -*-
import time

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    """CrawlSpider version: rules follow listing pages and scrape every post."""

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Links to the listing pages.
    pagelink = LinkExtractor(allow=r'type=4')
    # Links to individual posts; the dot before "shtml" is escaped so that it
    # matches a literal "." only.
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml')

    rules = [
        # Listing links have to be repaired by deal_links before being followed.
        Rule(pagelink, process_links="deal_links", follow=True),
        Rule(contentlink, callback='parse_item'),
    ]

    def deal_links(self, links):
        """Repair the query-string separators of the extracted listing links.

        Links of the form ``Type&type=4?page=xxx`` become
        ``Type?type=4&page=xxx`` (and ``Type&page=xxx?type=4`` becomes
        ``Type?page=xxx&type=4``); the unrepaired form cannot be requested.

        :param links: link objects extracted by ``pagelink``; modified in place.
        :returns: the same list.
        """
        for link in links:
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
            # Parenthesised single-argument print works on Python 2 and 3.
            print(link.url)
        return links

    def parse_item(self, response):
        """Build a ``DongguanItem`` from a single post page."""
        print(response.url)
        item = DongguanItem()
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()[0]
        # The number is whatever follows the last ":" in the last
        # space-separated token of the title.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # Posts with pictures keep their text in "contentext"; fall back to
        # the layout used by posts without pictures.
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if not content:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = "".join(content).strip()

        item['url'] = response.url
        yield item

pipelines.py

# -*- coding: utf-8 -*-
# codecs opens files with an explicit text encoding.
import codecs
import json


class JsonWriterPipeline(object):
    """Append every item to ``sunwz.json`` as one JSON document per line."""

    def __init__(self):
        # Write-only file, encoded as UTF-8.
        self.file = codecs.open('sunwz.json', 'w', encoding='utf-8')
        # Old attribute name, kept for code that still reads it.
        self.filename = self.file

    def process_item(self, item, spider):
        """Serialise ``item`` to one JSON line and return it unchanged."""
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        """Close the output file; Scrapy calls this hook when the spider closes."""
        self.file.close()

    # Former method name. Scrapy never called it automatically, and it closed
    # a non-existent ``self.file`` attribute; it is kept as an alias.
    spider_closed = close_spider

settings.py

ITEM_PIPELINES = {
    # Must name the pipeline class that pipelines.py actually defines.
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# Log file name and log level.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline

# The argument to "crawl" is the spider's ``name`` attribute ('sun'), not the
# name of the file that contains it.
cmdline.execute('scrapy crawl sun'.split())

执行程序

py2 main.py

 

(实战项目三)新浪网分类资讯爬虫

爬取新浪网导航页下的所有大类、小类、小类里的子链接,以及子链接页面的新闻内容。

效果演示图:

items.py

import sys

import scrapy

# Python 2 only: make UTF-8 the implicit str/unicode codec. Neither
# ``reload`` nor ``sys.setdefaultencoding`` is available on Python 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")


class SinaItem(scrapy.Item):
    """One news article together with the category path it was found under."""

    # Title and URL of the top-level category.
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Title and URL of the sub-category.
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # Directory in which the sub-category's articles are stored.
    subFilename = scrapy.Field()

    # Article link found inside the sub-category.
    sonUrls = scrapy.Field()

    # Headline and body text of the article.
    head = scrapy.Field()
    content = scrapy.Field()

spiders/sina.py

# -*- coding: utf-8 -*-
import os
import sys

import scrapy

from Sina.items import SinaItem

# Python 2 only: make UTF-8 the implicit str/unicode codec. Neither
# ``reload`` nor ``sys.setdefaultencoding`` is available on Python 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")


class SinaSpider(scrapy.Spider):
    """Crawl the Sina guide page: categories -> sub-categories -> articles."""

    name = "sina"
    allowed_domains = ["sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/guide/"
    ]

    def parse(self, response):
        """Create the category directories and request every sub-category."""
        # URLs and titles of all top-level categories.
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all sub-categories.
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        items = []
        for parent_title, parent_url in zip(parentTitle, parentUrls):
            # One directory per top-level category.
            parentFilename = "./Data/" + parent_title
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            for sub_url, sub_title in zip(subUrls, subTitle):
                # A sub-category belongs to the category whose URL it starts
                # with (sports.sina.com.cn and sports.sina.com.cn/nba).
                if not sub_url.startswith(parent_url):
                    continue

                subFilename = parentFilename + '/' + sub_title
                if not os.path.exists(subFilename):
                    os.makedirs(subFilename)

                item = SinaItem()
                item['parentTitle'] = parent_title
                item['parentUrls'] = parent_url
                item['subUrls'] = sub_url
                item['subTitle'] = sub_title
                item['subFilename'] = subFilename
                items.append(item)

        # Request every sub-category; the item travels along in ``meta``.
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item},
                                 callback=self.second_parse)

    def second_parse(self, response):
        """Request every article link found on a sub-category page."""
        meta_1 = response.meta['meta_1']

        items = []
        for son_url in response.xpath('//a/@href').extract():
            # Keep links that end with ".shtml" and start with the URL of the
            # top-level category.
            if_belong = (son_url.endswith('.shtml')
                         and son_url.startswith(meta_1['parentUrls']))
            if not if_belong:
                continue

            # Copy the category fields so that every article has its own item.
            item = SinaItem()
            item['parentTitle'] = meta_1['parentTitle']
            item['parentUrls'] = meta_1['parentUrls']
            item['subUrls'] = meta_1['subUrls']
            item['subTitle'] = meta_1['subTitle']
            item['subFilename'] = meta_1['subFilename']
            item['sonUrls'] = son_url
            items.append(item)

        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item},
                                 callback=self.detail_parse)

    def detail_parse(self, response):
        """Extract the headline and body text of one article."""
        item = response.meta['meta_2']
        # extract() turns the selector list into plain strings; storing the
        # selector list itself would leave ``head`` without the headline text.
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        item['head'] = head[0] if head else ""
        # Concatenate the text of all <p> tags.
        item['content'] = "".join(content_list)
        yield item

pipelines.py

import codecs
import os
import sys

from scrapy import signals

# Python 2 only: make UTF-8 the implicit str/unicode codec. Neither
# ``reload`` nor ``sys.setdefaultencoding`` is available on Python 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")


class SinaPipeline(object):
    """Write each article's text to a ``.txt`` file in its sub-category directory."""

    def process_item(self, item, spider):
        """Save ``item['content']`` and return the item unchanged.

        The file name is the article URL without its scheme and without the
        ``.shtml`` suffix, with ``/`` replaced by ``_``, plus ``.txt``.
        """
        sonUrls = item['sonUrls']
        # Drop the scheme whatever its length; a fixed [7:] slice is only
        # right for "http://".
        stem = sonUrls.split('://', 1)[-1]
        if stem.endswith('.shtml'):
            stem = stem[:-len('.shtml')]
        filename = stem.replace('/', '_') + ".txt"

        path = os.path.join(item['subFilename'], filename)
        # The context manager closes the file even when the write fails; the
        # explicit encoding makes the output independent of the default codec.
        with codecs.open(path, 'w', encoding='utf-8') as fp:
            fp.write(item['content'])
        return item

settings.py

BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

ITEM_PIPELINES = {
    'Sina.pipelines.SinaPipeline': 300,
}

LOG_LEVEL = 'DEBUG'

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline

# Equivalent to running "scrapy crawl sina" in a shell; handy for debugging
# the spider from an IDE.
cmdline.execute('scrapy crawl sina'.split())

执行程序

py2 main.py

Cosplay图片下载爬虫

items.py

import scrapy


class CoserItem(scrapy.Item):
    """One cosplay post and the pictures attached to it."""

    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    # URLs of the pictures to download.
    image_urls = scrapy.Field()
    # Filled by the download pipeline with the local file paths.
    images = scrapy.Field()

spiders/coser.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector

try:
    from scrapy.loader import ItemLoader
except ImportError:
    # Older Scrapy releases only ship the loader under scrapy.contrib.
    from scrapy.contrib.loader import ItemLoader

from Cosplay.items import CoserItem


class CoserSpider(scrapy.Spider):
    """Collect the posts of a few bcy.net users together with their picture URLs."""

    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173'
    )

    def parse(self, response):
        """Request every post linked from a user's page."""
        sel = Selector(response)
        post_links = sel.xpath(
            "//ul[@class='js-articles l-works']/li[@class='l-work--big']"
            "/article[@class='work work--second-created']"
            "/h2[@class='work__title']/a/@href"
        ).extract()
        for link in post_links:
            # The extracted hrefs are prefixed with the site root.
            link = 'http://bcy.net%s' % link
            yield scrapy.Request(link, callback=self.parse_item)

    def parse_item(self, response):
        """Load one post into a ``CoserItem``."""
        loader = ItemLoader(item=CoserItem(), response=response)
        loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
        loader.add_xpath(
            'info',
            "//div[@class='post__info']"
            "/div[@class='post__type post__info-group']/span/text()",
        )
        urls = loader.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Remove the "/w650" segment from every picture URL.
        urls = [url.replace('/w650', '') for url in urls]
        loader.add_value('image_urls', urls)
        loader.add_value('url', response.url)
        return loader.load_item()

pipelines.py

import logging
import os

import requests

from Cosplay import settings

logger = logging.getLogger(__name__)


class ImageDownloadPipeline(object):
    """Download every URL in ``item['image_urls']`` into ``IMAGES_STORE/<spider>``."""

    def process_item(self, item, spider):
        """Download the item's pictures and record their local paths.

        Files that already exist are not downloaded again. ``item['images']``
        lists the paths of the pictures that are present on disk afterwards.
        """
        if 'image_urls' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        images = []
        for image_url in item['image_urls']:
            # File name: the URL path segments joined with "_".
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)
            if os.path.exists(file_path) or self._download(image_url, file_path):
                images.append(file_path)

        item['images'] = images
        return item

    @staticmethod
    def _download(image_url, file_path):
        """Fetch ``image_url`` into ``file_path``; return True on success.

        The data is written to a temporary ``.part`` file that is renamed
        only after a complete download. A failed request therefore never
        leaves an empty or truncated file behind that a later run would
        mistake for a finished download.
        """
        tmp_path = file_path + '.part'
        try:
            response = requests.get(image_url, stream=True, timeout=30)
            response.raise_for_status()
            with open(tmp_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
        except (requests.RequestException, IOError) as exc:
            logger.warning("could not download %s: %s", image_url, exc)
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            return False
        os.rename(tmp_path, file_path)
        return True

settings.py

ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25    # 250 ms of delay

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline

# Equivalent to running "scrapy crawl coser" in a shell; handy for debugging
# the spider from an IDE.
cmdline.execute('scrapy crawl coser'.split())

执行程序

py2 main.py

用Pymongo保存数据

爬取豆瓣电影top250的电影数据,并保存在MongoDB中。

items.py

import scrapy


class DoubanspiderItem(scrapy.Item):
    """One entry of the Douban Top 250 movie list."""

    # Movie title.
    title = scrapy.Field()
    # Movie rating.
    score = scrapy.Field()
    # Movie details.
    content = scrapy.Field()
    # Short introduction.
    info = scrapy.Field()

spiders/douban.py

import scrapy

from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    """Scrape the Douban Top 250 list, 25 movies per page."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        """Yield one item per movie on the page, then request the next page."""
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            title = each.xpath(
                'div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                'div[@class="bd"]/div[@class="star"]'
                '/span[@class="rating_num"]/text()').extract()
            info = each.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # A fresh item per movie: a single shared instance would be
            # overwritten by later movies while earlier ones are still being
            # processed.
            item = DoubanspiderItem()
            item['title'] = title[0]
            # Join all content fragments with ";".
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            # Guard against an entry without a quote, where info is empty.
            item['info'] = info[0] if info else ''
            yield item

        # The last page starts at 225. "<=" would request start=250, which
        # is past the end of a 250-entry list.
        if self.start < 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end,
                                 callback=self.parse)

pipelines.py

import pymongo
from scrapy.utils.project import get_project_settings


class DoubanspiderPipeline(object):
    """Store every item as one document in a MongoDB collection."""

    def __init__(self):
        # Settings are read through get_project_settings(); the module-level
        # ``scrapy.conf.settings`` object is not available in current Scrapy.
        settings = get_project_settings()
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # Connect and select the database.
        client = pymongo.MongoClient(host=host, port=port)
        mdb = client[dbname]
        # Collection that receives the scraped documents.
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        """Insert the item into the collection and return it unchanged."""
        # Insert a plain-dict copy so the item itself is left untouched.
        data = dict(item)
        # insert_one replaces the deprecated Collection.insert.
        self.post.insert_one(data)
        return item

settings.py

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host: the loopback address 127.0.0.1.
MONGODB_HOST = '127.0.0.1'
# Port number; 27017 is the default.
MONGODB_PORT = 27017
# Database name.
MONGODB_DBNAME = 'DouBan'
# Name of the collection that stores this crawl's data.
MONGODB_DOCNAME = 'DouBanMovies'

运行

启动MongoDB数据库需要两个命令:

mongod:MongoDB数据库服务进程本身
mongo:命令行shell客户端

sudo mongod # 首先启动数据库服务,再执行Scrapy
sudo mongo # 启动数据库shell

在mongo shell下使用命令:

# 查看当前数据库
> db
# 列出所有的数据库
> show dbs
# 连接DouBan数据库
> use DouBan
# 列出所有表
> show collections
# 查看表里的数据(表名与settings.py里的MONGODB_DOCNAME一致)
> db.DouBanMovies.find()

三种Scrapy模拟登陆策略

注意:模拟登陆时,必须保证settings.py里的 COOKIES_ENABLED(Cookies中间件) 处于开启状态

COOKIES_ENABLED = True,或者保持该行的注释状态不变(# COOKIES_ENABLED = False),此时使用Scrapy的默认值,即开启Cookies

策略一:直接POST数据(比如需要登陆的账户信息)

只要是需要提供post数据的,就可以用这种方法。下面示例里post的数据是账户密码:

# -*- coding: utf-8 -*-
import scrapy


class Renren1Spider(scrapy.Spider):
    """Log in by POSTing the account data directly to the login URL."""

    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        """Send the login form as the first request of the crawl."""
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is how Scrapy sends a POST request with form data.
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page)

    def parse_page(self, response):
        """Save the page returned after the login request."""
        # response.body is bytes, so the file is opened in binary mode; text
        # mode fails on Python 3.
        with open("mao2.html", "wb") as page_file:
            page_file.write(response.body)

策略二:标准的模拟登陆步骤

正统模拟登录方法:

  1. 首先发送登录页面的get请求,获取到页面里的登录必须的参数(比如说zhihu登陆界面的 _xsrf)

  2. 然后和账户密码一起post到服务器,登录成功

# -*- coding: utf-8 -*-
import scrapy


class Renren2Spider(scrapy.Spider):
    """Standard login flow: fetch the login page, submit its form, then crawl."""

    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    def parse(self, response):
        """Submit the login form found in the login page's response.

        Extra parameters required by the site (a token such as ``_xsrf``, if
        needed) would be extracted from ``response`` here and added to
        ``formdata``.
        """
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page
        )

    def parse_page(self, response):
        """After logging in, request a page that requires a logged-in session."""
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    def parse_newpage(self, response):
        """Save the profile page."""
        # response.body is bytes, so the file is opened in binary mode; text
        # mode fails on Python 3.
        with open("xiao.html", "wb") as page_file:
            page_file.write(response.body)

策略三:直接使用保存登陆状态的Cookie模拟登陆

如果实在没办法了,可以用这种方法模拟登录,虽然麻烦一点,但是成功率100%

# -*- coding: utf-8 -*-
import scrapy


class RenrenSpider(scrapy.Spider):
    """Reuse the cookies of an already logged-in session instead of logging in."""

    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    )

    # Cookies of a logged-in session, sent with every start request.
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome"
    }

    def start_requests(self):
        """Override Spider.start_requests to attach the cookies to every start URL."""
        for url in self.start_urls:
            # No form data is given, so this FormRequest is sent as a GET.
            yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)

    def parse_page(self, response):
        """Print the URL and save the page body."""
        # Parenthesised single-argument print works on Python 2 and 3.
        print("===========" + response.url)
        # NOTE(review): all three start URLs write to the same file, so only
        # the last response survives; use a per-URL file name if every page
        # is needed.
        # response.body is bytes, so the file is opened in binary mode.
        with open("deng.html", "wb") as page_file:
            page_file.write(response.body)

 

你可能感兴趣的文章
HashMap和Hashtable的区别
查看>>
JVM 对 Java 的原生锁做了哪些优化?
查看>>
JAVA实现简单的阻塞队列
查看>>
我的2020
查看>>
idea快捷键使用
查看>>
2.1MAC协议概述
查看>>
2.3 WSN的MAC协议
查看>>
图解后缀表达式的计算过程
查看>>
栈与队列的应用——计算表达式的值
查看>>
静态链表——sharing
查看>>
静态链表——sorting
查看>>
DFS——背包问题
查看>>
DFS——选数问题
查看>>
BFS——求矩阵中“块”的个数
查看>>
BFS——走迷宫的最小步数
查看>>
并查集——好朋友
查看>>
关键路径
查看>>
Web前端学习笔记——JavaScript之事件详解
查看>>
Web前端学习笔记——JavaScript之事件、创建元素、节点操作
查看>>
Web前端学习笔记——JavaScript之正则表达式、伪数组、垃圾回收
查看>>