本文共 19956 字,大约阅读时间需要 66 分钟。
目录
1. items.py
class DouyuspiderItem(scrapy.Item):
    """Item holding one photo: its name, its source URL and its local path."""

    # Name used for the photo.
    name = scrapy.Field()
    # URL the photo is downloaded from.
    imagesUrls = scrapy.Field()
    # Path of the photo once saved locally.
    imagesPath = scrapy.Field()
2. spiders/douyu.py
import json

import scrapy

from douyuSpider.items import DouyuspiderItem


class DouyuSpider(scrapy.Spider):
    """Page through the Douyu vertical-room JSON API, yielding one item per entry.

    Each response is JSON whose ``data`` key holds a list of entries; the
    spider keeps requesting the next offset until a page comes back empty.
    """

    name = "douyu"
    # Scrapy's offsite filtering expects bare domain names here, not URLs
    # (the original also misspelled the attribute, so it was ignored).
    allowed_domains = ["capi.douyucdn.cn"]

    offset = 0
    # The page size (limit=20) is baked into the URL; parse() advances the
    # offset by the same amount.
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield an item per entry of the page, then request the next page.

        :param response: JSON response of one API page.
        :returns: generator of ``DouyuspiderItem`` objects followed by at
            most one follow-up ``scrapy.Request``.
        """
        data = json.loads(response.text)["data"]
        if not data:
            # An empty page means the listing is exhausted; without this
            # check the spider would request new offsets forever.
            return

        for each in data:
            item = DouyuspiderItem()
            item["name"] = each["nickname"]
            item["imagesUrls"] = each["vertical_src"]
            yield item

        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
3. 设置settings.py
# Enable the project's image pipeline.
ITEM_PIPELINES = {'douyuSpider.pipelines.ImagesPipeline': 1}

# Directory where downloaded images are stored; read back in pipelines.py.
IMAGES_STORE = "/Users/Power/lesson_python/douyuSpider/Images"

# User-Agent header sent with every request.
USER_AGENT = 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)'
4. pipelines.py
import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


# NOTE: the class deliberately keeps the name ``ImagesPipeline`` (shadowing
# the imported base class) because settings.py refers to it as
# 'douyuSpider.pipelines.ImagesPipeline'.
class ImagesPipeline(ImagesPipeline):
    """Download each item's image and rename the file after the item's name."""

    # Root directory of downloaded images, taken from the project settings.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield one download request for the item's image URL.

        The outcome of these requests is what ``item_completed`` later
        receives as ``results``.
        """
        yield scrapy.Request(item["imagesUrls"])

    def item_completed(self, results, item, info):
        """Rename the downloaded file to ``<name>.jpg`` and record its path.

        :param results: list of ``(success, image_info_or_failure)`` tuples;
            when ``success`` is true the second element is a dict with the
            keys ``url``, ``path`` and ``checksum``.
        :param item: the item whose image was requested.
        :param info: pipeline bookkeeping object (unused here).
        :returns: the item, with ``imagesPath`` set to the renamed file.
        :raises DropItem: when no image was downloaded successfully.
        """
        image_paths = [outcome["path"] for ok, outcome in results if ok]
        if not image_paths:
            # The original indexed [0] unconditionally and crashed with
            # IndexError whenever the download failed.
            raise DropItem("Image download failed for %s" % item["imagesUrls"])

        source = os.path.join(self.IMAGES_STORE, image_paths[0])
        target = os.path.join(self.IMAGES_STORE, item["name"] + ".jpg")
        os.rename(source, target)

        # Store the path of the file that actually exists (with extension).
        item["imagesPath"] = target
        return item
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline

# Debug entry point: runs the same command line as typing it in a shell.
argv = 'scrapy crawl douyu'.split()
cmdline.execute(argv)
执行程序
py2 main.py
爬取投诉帖子的编号、帖子的url、帖子的标题,和帖子里的内容。
import scrapy


class DongguanItem(scrapy.Item):
    """Item describing one post: title, number, text content and URL."""

    # Title of the post.
    title = scrapy.Field()
    # Number of the post.
    number = scrapy.Field()
    # Text content of the post.
    content = scrapy.Field()
    # URL of the post.
    url = scrapy.Field()
Spider 版本
# -*- coding: utf-8 -*-
import scrapy

from dongguan.items import DongguanItem


# Plain Spider: the original subclassed CrawlSpider, which is neither
# imported in this module nor compatible with overriding parse().
class SunSpider(scrapy.Spider):
    """Walk the listing pages by offset and scrape every linked post."""

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Request every post linked from a listing page, then the next page.

        :param response: one listing page.
        :returns: generator of requests (posts first, then the next page).
        """
        # Links to the posts on this listing page.
        links = response.xpath(
            "//div[@class='greyframe']/table//td/a[@class='news14']/@href"
        ).extract()
        for link in links:
            yield scrapy.Request(link, callback=self.parse_item)

        # Page-number stop condition; each new listing page comes back here.
        if self.offset <= 71130:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_item(self, response):
        """Extract title, number, text content and URL from one post page.

        Pages without the expected title node are logged and skipped instead
        of raising IndexError.
        """
        titles = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()
        if not titles:
            self.logger.warning("No title found on %s", response.url)
            return

        item = DongguanItem()
        item['title'] = titles[0]
        # The number is whatever follows the last ":" in the title's last
        # space-separated token.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # Try the layout used when the post has pictures first ...
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if not content:
            # ... and fall back to the layout used when it has none.
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list: join it and trim surrounding whitespace.
        item['content'] = "".join(content).strip()

        item['url'] = response.url
        yield item
CrawlSpider 版本
# -*- coding: utf-8 -*-
import time

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    """Follow listing pages via link rules and scrape every linked post."""

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Matches the links to listing pages.
    pagelink = LinkExtractor(allow=r'type=4')
    # Matches the links to individual posts (the dot is escaped so that it
    # only matches a literal ".shtml").
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml')

    rules = [
        # Special case: listing links must be repaired by deal_links before
        # they can be requested.
        Rule(pagelink, process_links="deal_links", follow=True),
        Rule(contentlink, callback='parse_item'),
    ]

    def deal_links(self, links):
        """Repair the query-string separators of extracted listing links.

        Rewrites 'Type&type=4?page=xxx' to 'Type?type=4&page=xxx' (or
        'Type&page=xxx?type=4' to 'Type?page=xxx&type=4'); otherwise the
        link cannot be requested.

        :param links: list of extracted link objects (mutated in place).
        :returns: the same list.
        """
        for link in links:
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
            # Single-argument call form prints identically on Python 2 and 3.
            print(link.url)
        return links

    def parse_item(self, response):
        """Extract title, number, text content and URL from one post page.

        Pages without the expected title node are logged and skipped instead
        of raising IndexError.
        """
        print(response.url)

        titles = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()
        if not titles:
            self.logger.warning("No title found on %s", response.url)
            return

        item = DongguanItem()
        item['title'] = titles[0]
        # The number is whatever follows the last ":" in the title's last
        # space-separated token.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # Try the layout used when the post has pictures first ...
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if not content:
            # ... and fall back to the layout used when it has none.
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list: join it and trim surrounding whitespace.
        item['content'] = "".join(content).strip()

        item['url'] = response.url
        yield item
pipelines.py
# -*- coding: utf-8 -*-
# codecs lets the output file be opened with an explicit text encoding.
import codecs
import json


class JsonWriterPipeline(object):
    """Write every item to ``sunwz.json`` as one JSON object per line."""

    def __init__(self):
        # Write-only, UTF-8 encoded output file (truncated on open).
        self.file = codecs.open('sunwz.json', 'w', encoding='utf-8')
        # Alias kept because the handle was originally stored under this name.
        self.filename = self.file

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and pass it on unchanged.

        :param item: mapping-like item; converted with ``dict(item)``.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file; this is the hook name Scrapy calls."""
        self.file.close()

    # The original method name is kept as an alias. It used to close the
    # nonexistent attribute ``self.file`` and raise AttributeError.
    spider_closed = close_spider
settings.py
# Must name the class actually defined in pipelines.py (JsonWriterPipeline);
# the original pointed at a DongguanPipeline that pipelines.py does not define.
ITEM_PIPELINES = {
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# Log file name and log level.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline

# Debug entry point. The spider is registered under name = 'sun', so that is
# the name to crawl ('sunwz' does not match any spider shown here).
cmdline.execute('scrapy crawl sun'.split())
执行程序
py2 main.py
爬取新浪网导航页下所有大类、小类、小类里的子链接,以及子链接页面的新闻内容。
效果演示图:
items.py
import sys

import scrapy

# Python 2 only: force the interpreter's default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")


class SinaItem(scrapy.Item):
    """Item carrying category, sub-category and article data."""

    # Title and URL of the top-level category.
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Title and URL of the sub-category.
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # Directory in which the sub-category is stored.
    subFilename = scrapy.Field()

    # Link found under the sub-category.
    sonUrls = scrapy.Field()

    # Article headline and body.
    head = scrapy.Field()
    content = scrapy.Field()
spiders/sina.py
# -*- coding: utf-8 -*-
import os
import sys

import scrapy

from Sina.items import SinaItem

# Python 2 only: force the interpreter's default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")


class SinaSpider(scrapy.Spider):
    """Crawl the Sina guide page: categories, sub-categories, then articles."""

    name = "sina"
    allowed_domains = ["sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/guide/"
    ]

    def parse(self, response):
        """Create a directory per (sub-)category and request each sub-category.

        :param response: the guide page.
        :returns: generator of requests handled by ``second_parse``; each
            carries its partially filled item under ``meta['meta_1']``.
        """
        # URLs and titles of all top-level categories.
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all sub-categories.
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        for parent_title, parent_url in zip(parentTitle, parentUrls):
            # Directory of the top-level category, created on demand.
            parentFilename = "./Data/" + parent_title
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            for sub_url, sub_title in zip(subUrls, subTitle):
                # A sub-category belongs to this category when its URL starts
                # with the category URL (sports.sina.com.cn and
                # sports.sina.com.cn/nba).
                if not sub_url.startswith(parent_url):
                    continue

                # Sub-category directory lives inside its category directory.
                subFilename = parentFilename + '/' + sub_title
                if not os.path.exists(subFilename):
                    os.makedirs(subFilename)

                item = SinaItem()
                item['parentTitle'] = parent_title
                item['parentUrls'] = parent_url
                item['subUrls'] = sub_url
                item['subTitle'] = sub_title
                item['subFilename'] = subFilename

                yield scrapy.Request(
                    url=item['subUrls'],
                    meta={'meta_1': item},
                    callback=self.second_parse,
                )

    def second_parse(self, response):
        """Request every article link found on a sub-category page.

        A link qualifies when it ends with ``.shtml`` and starts with the
        URL of the top-level category.

        :param response: a sub-category page carrying ``meta['meta_1']``.
        :returns: generator of requests handled by ``detail_parse``; each
            carries its item under ``meta['meta_2']``.
        """
        meta_1 = response.meta['meta_1']

        # Every link on the sub-category page.
        for son_url in response.xpath('//a/@href').extract():
            if not (son_url.endswith('.shtml')
                    and son_url.startswith(meta_1['parentUrls'])):
                continue

            # Copy the category data so everything travels in a single item.
            item = SinaItem()
            item['parentTitle'] = meta_1['parentTitle']
            item['parentUrls'] = meta_1['parentUrls']
            item['subUrls'] = meta_1['subUrls']
            item['subTitle'] = meta_1['subTitle']
            item['subFilename'] = meta_1['subFilename']
            item['sonUrls'] = son_url

            yield scrapy.Request(
                url=item['sonUrls'],
                meta={'meta_2': item},
                callback=self.detail_parse,
            )

    def detail_parse(self, response):
        """Fill in the article headline and body and yield the finished item.

        :param response: an article page carrying ``meta['meta_2']``.
        """
        item = response.meta['meta_2']

        # The headline must be extracted to text; the original stored the
        # raw selector list in the item.
        head = response.xpath('//h1[@id="main_title"]/text()').extract_first(default="")
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        item['head'] = head
        # Merge the text of all <p> tags into one string.
        item['content'] = "".join(content_list)
        yield item
pipelines.py
import codecs
import sys

from scrapy import signals

# Python 2 only: force the interpreter's default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")


class SinaPipeline(object):
    """Save each article body as a ``.txt`` file in its sub-category directory."""

    def process_item(self, item, spider):
        """Write ``item['content']`` to a file named after ``item['sonUrls']``.

        :param item: item providing ``sonUrls``, ``subFilename`` and
            ``content``.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``.
        """
        path = item['subFilename'] + '/' + self._filename_for(item['sonUrls'])
        # The context manager closes the file even if the write fails, and
        # the encoding is explicit rather than left to the process default.
        with codecs.open(path, 'w', encoding='utf-8') as fp:
            fp.write(item['content'])
        return item

    @staticmethod
    def _filename_for(url):
        """Turn an article URL into a flat ``.txt`` file name.

        The scheme and a trailing ``.shtml`` are dropped and ``/`` becomes
        ``_``. For ``http://...shtml`` URLs this gives the same result as
        the original ``url[7:-6]`` slice; other schemes such as ``https://``
        are now handled as well.
        """
        stem = url.split('://', 1)[-1]
        if stem.endswith('.shtml'):
            stem = stem[:-len('.shtml')]
        return stem.replace('/', '_') + ".txt"
settings.py
# Project name and spider module locations.
BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

# Enable the pipeline that writes articles to disk.
ITEM_PIPELINES = {
    'Sina.pipelines.SinaPipeline': 300,
}

LOG_LEVEL = 'DEBUG'
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline

# Debug entry point: runs the same command line as typing it in a shell.
argv = 'scrapy crawl sina'.split()
cmdline.execute(argv)
执行程序
py2 main.py
items.py
class CoserItem(scrapy.Item):
    """Item for one post: page URL, name, info, image URLs and saved images."""

    # URL of the post page.
    url = scrapy.Field()
    # Name taken from the post's title node.
    name = scrapy.Field()
    # Text taken from the post's info block.
    info = scrapy.Field()
    # URLs of the images to download.
    image_urls = scrapy.Field()
    # Filled in by the download pipeline with local file paths.
    images = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
# ItemLoader lives in scrapy.loader; the old scrapy.contrib.loader path is
# gone from current Scrapy releases, and the rest of this project already
# uses the newer module layout.
from scrapy.loader import ItemLoader
from scrapy.selector import Selector

from Cosplay.items import CoserItem


class CoserSpider(scrapy.Spider):
    """Visit each listed user page, follow its posts and load a CoserItem."""

    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173',
    )

    def parse(self, response):
        """Yield a request for every post linked from a user page.

        :param response: one of the start pages.
        :returns: generator of requests handled by ``parse_item``.
        """
        sel = Selector(response)
        post_links = sel.xpath(
            "//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href"
        ).extract()
        for link in post_links:
            # The extracted hrefs are site-relative, so prefix the host.
            link = 'http://bcy.net%s' % link
            yield scrapy.Request(link, callback=self.parse_item)

    def parse_item(self, response):
        """Load name, info, image URLs and page URL from one post page.

        :param response: a post page.
        :returns: the populated ``CoserItem``.
        """
        loader = ItemLoader(item=CoserItem(), response=response)
        loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
        loader.add_xpath(
            'info',
            "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()",
        )

        urls = loader.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Remove the '/w650' part of each image URL.
        urls = [url.replace('/w650', '') for url in urls]
        loader.add_value('image_urls', urls)

        loader.add_value('url', response.url)
        return loader.load_item()
pipelines.py
import logging
import os

import requests

from Cosplay import settings

logger = logging.getLogger(__name__)


class ImageDownloadPipeline(object):
    """Download every URL in ``item['image_urls']`` into a per-spider folder."""

    # Seconds to wait for the image server before giving up on one image.
    DOWNLOAD_TIMEOUT = 30

    def process_item(self, item, spider):
        """Download the item's images and record their local paths.

        Files already present on disk are not downloaded again. Images that
        fail to download are logged and left out of ``item['images']``.

        :param item: item that may carry ``image_urls``.
        :param spider: the running spider; its ``name`` is the folder name.
        :returns: the same ``item``.
        """
        if 'image_urls' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        images = []
        for image_url in item['image_urls']:
            # File name = everything after the host, joined with '_'.
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)
            if os.path.exists(file_path) or self._download(image_url, file_path):
                images.append(file_path)

        item['images'] = images
        return item

    def _download(self, image_url, file_path):
        """Stream ``image_url`` to ``file_path``; return True on success.

        The data goes to a temporary ``.part`` file that is renamed only
        when the download finishes. The original opened the final file
        before sending the request, so a failed request left an empty file
        that later runs mistook for a finished download.
        """
        part_path = file_path + '.part'
        try:
            response = requests.get(
                image_url, stream=True, timeout=self.DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            with open(part_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
        except (requests.RequestException, IOError) as exc:
            logger.warning("Could not download %s: %s", image_url, exc)
            if os.path.exists(part_path):
                os.remove(part_path)
            return False

        os.rename(part_path, file_path)
        return True
settings.py
# Enable the image download pipeline.
ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

# Directory the pipeline saves images under.
IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25  # 250 ms of delay
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline

# Debug entry point: runs the same command line as typing it in a shell.
argv = 'scrapy crawl coser'.split()
cmdline.execute(argv)
执行程序
py2 main.py
爬取豆瓣电影top250的电影数据,并保存在MongoDB中。
items.py
class DoubanspiderItem(scrapy.Item):
    """Item describing one film: title, score, details and summary."""

    # Film title.
    title = scrapy.Field()
    # Film score.
    score = scrapy.Field()
    # Film details.
    content = scrapy.Field()
    # Short summary.
    info = scrapy.Field()
spiders/douban.py
import scrapy

from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    """Scrape the Douban Top 250 listing, 25 films per page."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        """Yield one item per film on the page, then request the next page.

        :param response: one listing page.
        :returns: generator of ``DoubanspiderItem`` objects followed by at
            most one follow-up ``scrapy.Request``.
        """
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            title = each.xpath(
                'div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            info = each.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # A fresh item per film: the original created one item before
            # the loop and kept mutating the object it had already yielded.
            item = DoubanspiderItem()
            item['title'] = title[0] if title else ''
            # Merge all content fragments into one ';'-separated string.
            item['content'] = ';'.join(content)
            item['score'] = score[0] if score else ''
            # Not every film has a quote; avoid IndexError on those.
            item['info'] = info[0] if info else ''
            yield item

        # The last page starts at 225; the original condition (<= 225)
        # issued one more request for start=250, beyond the 250 films.
        if self.start < 225:
            self.start += 25
            yield scrapy.Request(
                self.url + str(self.start) + self.end, callback=self.parse)
pipelines.py
import pymongo
from scrapy.utils.project import get_project_settings


class DoubanspiderPipeline(object):
    """Store every scraped item as a document in a MongoDB collection."""

    def __init__(self, settings=None):
        """Connect to MongoDB using the MONGODB_* project settings.

        :param settings: settings object to read from. When omitted, the
            project settings are loaded, so the pipeline can still be built
            without arguments (``scrapy.conf`` no longer exists in current
            Scrapy releases).
        """
        if settings is None:
            settings = get_project_settings()

        # Host, port and database name come from settings.py.
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        self.client = pymongo.MongoClient(host=host, port=port)
        mdb = self.client[dbname]
        # Collection that receives the scraped items.
        self.post = mdb[settings['MONGODB_DOCNAME']]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on unchanged."""
        # insert_one replaces the deprecated Collection.insert.
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()
settings.py
# Project name and spider module locations.
BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host: the loopback address 127.0.0.1.
MONGODB_HOST = '127.0.0.1'
# Port number; 27017 is the default.
MONGODB_PORT = 27017
# Database name.
MONGODB_DBNAME = 'DouBan'
# Name of the collection that stores this crawl's data.
MONGODB_DOCNAME = 'DouBanMovies'
运行
启动MongoDB数据库需要两个命令:
mongod:是MongoDB数据库进程本身
mongo:是命令行shell客户端
sudo mongod # 首先启动数据库服务,再执行Scrapy
sudo mongo # 启动数据库shell
在mongo shell下使用命令:
# 查看当前数据库
> db
# 列出所有的数据库
> show dbs
# 连接DouBan数据库
> use DouBan
# 列出所有表
> show collections
# 查看表里的数据(表名与settings.py中的MONGODB_DOCNAME一致)
> db.DouBanMovies.find()
COOKIES_ENABLED
(Cookies中间件) 处于开启状态
# Keep the cookies middleware switched on.
COOKIES_ENABLED = True
或# COOKIES_ENABLED = False
策略一:直接POST数据(比如需要登陆的账户信息)
只要是需要提供post数据的,就可以用这种方法。下面示例里post的数据是账户密码:
# -*- coding: utf-8 -*-
import scrapy


class Renren1Spider(scrapy.Spider):
    """Log in by POSTing the account data directly, then save the response."""

    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        """Send the login form as the very first request."""
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is how Scrapy sends POST requests.
        # NOTE(review): the credentials are hard-coded sample values.
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """Save the raw response body to ``mao2.html``."""
        # response.body is bytes, so the file must be opened in binary mode
        # (text mode fails on Python 3).
        with open("mao2.html", "wb") as handle:
            handle.write(response.body)
策略二:标准的模拟登陆步骤
正统模拟登录方法:
首先发送登录页面的get请求,获取到页面里的登录必须的参数(比如说zhihu登陆界面的 _xsrf)
然后和账户密码一起post到服务器,登录成功
# -*- coding: utf-8 -*-
import scrapy


class Renren2Spider(scrapy.Spider):
    """Fetch the login page, submit its form, then open a logged-in page."""

    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    def parse(self, response):
        """Submit the login form found in the login page.

        Any extra parameter the login needs (for example an ``_xsrf``
        token) would be extracted from ``response`` here and added to
        ``formdata``.
        """
        # NOTE(review): the credentials are hard-coded sample values.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """After the login response, request a page that requires login."""
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    def parse_newpage(self, response):
        """Save the raw response body to ``xiao.html``."""
        # response.body is bytes, so the file must be opened in binary mode
        # (text mode fails on Python 3).
        with open("xiao.html", "wb") as handle:
            handle.write(response.body)
策略三:直接使用保存登陆状态的Cookie模拟登陆
如果实在没办法了,可以用这种方法模拟登录,虽然麻烦一点,但是成功率100%
# -*- coding: utf-8 -*-
import scrapy


class RenrenSpider(scrapy.Spider):
    """Request pages with a saved set of login cookies attached."""

    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    )

    # Cookies sent with every start request.
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome",
    }

    def start_requests(self):
        """Override Spider.start_requests to attach the cookies to each URL."""
        for url in self.start_urls:
            # NOTE(review): no formdata is passed, so no form fields are
            # sent; confirm whether a plain scrapy.Request was intended.
            yield scrapy.FormRequest(
                url, cookies=self.cookies, callback=self.parse_page)

    def parse_page(self, response):
        """Print the response URL and save the raw body to ``deng.html``."""
        # Single-argument call form prints identically on Python 2 and 3.
        print("===========" + response.url)
        # response.body is bytes, so the file must be opened in binary mode
        # (text mode fails on Python 3).
        with open("deng.html", "wb") as handle:
            handle.write(response.body)