博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
简书全站爬取 mysql异步保存
阅读量:5117 次
发布时间:2019-06-13

本文共 8117 字,大约阅读时间需要 27 分钟。

# Jianshu full-site crawl.
# Data is saved to MySQL; integrating selenium+chromedriver into scrapy and
# capturing ajax-loaded data are covered in the later sections of this post.

# --- spider file ---
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    """Crawl jianshu.com article detail pages, starting from the homepage."""

    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']  # start crawling from the homepage

    rules = (
        # On a detail page, the recommended-article links are plain
        # /p/<12 lowercase hex chars> hrefs, so a single rule covers them.
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Extract one article's fields from a detail page and yield an item."""
        title = response.xpath(
            "//div[@class='note']/div[@class='post']/div[@class='article']"
            "/h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        # FIX: .get() returns None when the node is missing, which used to
        # raise AttributeError on .replace(); guard before stripping the '*'.
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        if pub_time:
            pub_time = pub_time.replace("*", "")
        # A normal article URL contains at most one '?'; the article id is
        # the last path segment before the query string.
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        # Keep the raw HTML tags so the content can be rendered later.
        content = response.xpath("//div[@class='show-content']").get()
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=response.url,
            article_id=article_id,
            content=content,
        )
        yield item


# --- items file ---
class ArticleItem(scrapy.Item):
    # Fields scraped from an article detail page.
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()


# --- pipeline file: synchronous save to MySQL ---
import pymysql
from twisted.enterprise import adbapi   # database helper module, used below
from pymysql import cursors


class JianshuSpiderPipeline(object):
    """Save items to MySQL synchronously with pymysql (simple but slow)."""

    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
        }
        # **dbparams expands to host='127.0.0.1', port=3306, ... arguments.
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'],
                                       item['author'], item['avatar'],
                                       item['pub_time'], item['origin_url'],
                                       item['article_id']))
        self.conn.commit()  # synchronous commit -- this is the slow part
        return item

    @property
    def sql(self):
        # FIX: lazily build the statement once; the original duplicated
        # `return self._sql` in both branches of an unnecessary if/else.
        if not self._sql:
            self._sql = '''
            insert into article2(id,title,content,author,avatar,pub_time,
            origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
            '''
        return self._sql

 

# 优化上面的pipeline文件,  实现异步保存# 使用twisted 提供的数据库连接池 ConnectionPool,把插入数据的动作变成异步的 (面试可以说)# 上面的存储是同步 比较慢, 现在优化成异步class JianshuTwistedPipeline(object):    def __init__(self):        # 创建连接池        dbparams = {            'host': '127.0.0.1',            'port': 3306,            'user': 'root',            'password': '',            'database': 'jianshu',            'charset': 'utf8',            'cursorclass':cursors.DictCursor        }        self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)        self._sql = None    @property    def sql(self):        if not self._sql: # 如果没有 执行            self._sql = '''            insert into article2(id,title,content,author,avatar,pub_time,            origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)            '''            return self._sql        else:            return self._sql    def process_item(self,item,spider):        # runInteraction执行异步的        defer = self.dbpool.runInteraction(self.insert_item,item)        defer.addErrback(self.handle_error,item,spider)    def insert_item(self,cursor,item): # 插入数据库        cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],                                      item['pub_time'],item['origin_url'],item['article_id']))    def handle_error(self,error,item,spider):        print('='*20)        print("error:",error)        print('='*20)# 把settings中的pipeline文件改一下ITEM_PIPELINES = {   # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,   'jianshu_spider.pipelines.JianshuTwistedPipeline': 300, # 异步保存数据}

 

# 优化动态数据     处理ajax加载进来的数据# selenium+chromdriver 处理# 爬虫文件  把阅读量,点赞数,文章字数,标题分类,评论数 字段获取,保存到item中    def parse_detail(self, response):        # print(response.text)        title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()        print(title)        avatar = response.xpath("//a[@class='avatar']/img/@src").get()        # print(avatar)        author = response.xpath("//span[@class='name']/a/text()").get()        # print(author)        pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")        # print(pub_time)        # url正常情况下里面只有一个?        url = response.url        url1 = url.split("?")[0]        article_id = url1.split("/")[-1]        # print(article_id)        # 把html标签一起趴下来, 方便以后展示        content = response.xpath("//div[@class='show-content']").get()        # print(content)        # 动态获取下面的数据        word_count = response.xpath("//span[@class='wordage']/text()").get().split(" ")[-1]        read_count = response.xpath("//span[@class='views-count']/text()").get().split(" ")[-1]        comment_count = response.xpath("//span[@class='comments-count']/text()").get().split(" ")[-1]        like_count = response.xpath("//span[@class='likes-count']/text()").get().split(" ")[-1]        subject = response.xpath("//div[@class='include-collection']/a/div/text()").getall()        # subject 获取的时候一个列表  存到mysql的时候不支持, 需要把列表转成字符串        subject = ",".join(subject)        item = ArticleItem(            title=title,            avatar=avatar,            author=author,            pub_time=pub_time,            origin_url=response.url,            article_id=article_id,            content=content,                        word_count=word_count,            read_count=read_count,            comment_count=comment_count,            like_count=like_count,            subject=subject,        )        yield item# 管道文件# 上面的存储是同步 比较慢, 现在优化成异步class JianshuTwistedPipeline(object):    def 
__init__(self):        # 创建连接池        dbparams = {            'host': '127.0.0.1',            'port': 3306,            'user': 'root',            'password': '',            'database': 'jianshu',            'charset': 'utf8',            'cursorclass':cursors.DictCursor        }        self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)        self._sql = None    @property    def sql(self):        if not self._sql: # 如果没有 执行            self._sql = '''            insert into article2(id,title,content,author,avatar,pub_time,            origin_url,article_id,read_count, word_count, like_count, comment_count,subject)             values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)            '''            #            return self._sql        else:            return self._sql    def process_item(self,item,spider):        # runInteraction执行异步的        defer = self.dbpool.runInteraction(self.insert_item,item)        defer.addErrback(self.handle_error,item,spider)    def insert_item(self,cursor,item): # 插入数据库        cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],                                      item['pub_time'],item['origin_url'],item['article_id'],                                 item['read_count'],item['word_count'],item['like_count'],item['comment_count'],item['subject']))    def handle_error(self,error,item,spider):        print('='*20+'error'+'='*20)        print("error:",error)        print('='*20+'error'+'='*20)

 

转载于:https://www.cnblogs.com/kenD/p/11123696.html

你可能感兴趣的文章
Linux服务器开启tomcat的gc日志
查看>>
PCL—点云滤波(基于点云频率) 低层次点云处理
查看>>
Flask框架视图多层装饰器问题
查看>>
VS2005新建应用工程出错解决方法
查看>>
自制Informatica教程
查看>>
Day03 单行函数
查看>>
MongoDB 安全配置
查看>>
【接口】【USB】1.学习笔记
查看>>
dubbo的简单实现
查看>>
在学习oauth时学到的
查看>>
hdu2602Bone Collector(01背包)
查看>>
Temporary failure in name resolutionf的解决方法
查看>>
在jinja2的页面中使用javascript对页面元素进行删除
查看>>
hibernate save,update,saveorupdate方法有什么区别
查看>>
jqueryUI小案例
查看>>
Git的入门
查看>>
bzoj2720: [Violet 5]列队春游(概率期望+组合数学)
查看>>
Javascript 综合示例 网页扫雷游戏
查看>>
Python机器学习基础教程
查看>>
iOS的CocoaPods(activesupport requires Ruby version >= 2.2.2)
查看>>