Summary of scraping recruitment-site data with Scrapy

Problems encountered:

Testing phase (working out the extraction rules): scrapy shell <URL>

I kept getting a 302 REDIRECT: the site wants cookies. I never managed to set them correctly in the terminal, so I stopped fighting it there and set the cookies in PyCharm instead, as follows:

custom_settings = {
    "COOKIES_ENABLED": False,
    "DOWNLOAD_DELAY": 1,
    'DEFAULT_REQUEST_HEADERS': {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'user_trace_token=20180809145407-031d6690-9ba1-11e8-b9cd-525400f775ce; LGUID=20180809145407-031d6aa7-9ba1-11e8-b9cd-525400f775ce; LG_LOGIN_USER_ID=1d2009da1adcd1e1b3df90ae486e0a6ed6f927d9d0ad0806; JSESSIONID=ABAAABAAADEAAFIAE5968C60EFF2D413ECDB2FF8F13ABCB; WEBTJ-ID=20180813160154-165324e448fcc9-071b34d843c314-102c1709-1049088-165324e4490cd2; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu; PRE_SITE=https%3A%2F%2Fwww.baidu%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xb29d323700012351%26issp%3D1%26f%3D3%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26rsv_sug3%3D1%26rsv_sug1%3D1%26rsv_sug7%3D100%26rsv_sug2%3D0%26prefixsug%3D%2525E6%25258B%252589%2525E5%25258B%2525BE%26rsp%3D0%26inputT%3D1498%26rsv_sug4%3D1498; PRE_LAND=https%3A%2F%2Fwww.lagou%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; _putrc=3D7BB7F7DC9382C5; login=true; unick=mzttts; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; gate_login_token=221b6ccc998948d8s8ae5db7131c21fc68d1b90499eea7d6; TG-TRACK-CODE=index_navigation; SEARCH_ID=f386beab95164a8e8e860aa82b4bd751; index_location_city=%E5%8C%97%E4%BA%AC; _gid=GA1.2.1333500619.1534062851; _ga=GA1.2.504828216.1533797648; LGSID=20180813160155-24f4dafe-9ecf-11e8-bb57-525400f775ce; LGRID=20180813160218-331abb23-9ecf-11e8-a37b-5254005c3644; Hm_lvt_4233e74dff0ae5bccf756e6=15337d0a3d81c697648,1534062851,1534147148,1534147316; Hm_lpvt_4233e74dff0a81c6ccf756e5bd0a3de6=1534147348',
        'Host': 'www.lagou.com',
        'Origin': '',
        'Referer': '/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
    }
}

With these cookies in place, the page content could be fetched.
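Incidentally, the terminal route I gave up on should also be workable: inside scrapy shell you can build a Request that carries the same headers and fetch it by hand. A minimal sketch, assuming a made-up job URL and whatever cookie string you copy from the browser (starting the shell with -s COOKIES_ENABLED=0 keeps the cookies middleware from touching the raw Cookie header):

# start the shell first:  scrapy shell -s COOKIES_ENABLED=0
from scrapy import Request

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
    'Cookie': '<cookie string copied from the browser>',
}
# hypothetical job page URL, just to show the shape of the call
fetch(Request('https://www.lagou.com/jobs/12345.html', headers=headers))
response.css('.job-name::attr(title)').extract_first()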

During testing I realised that, because the extraction rules need many rounds of tweaking, it is not practical to run the whole crawler every time. After some fiddling I found two ways around this:

1. Run scrapy shell from the terminal inside PyCharm: the page content comes back without being redirected. I suspect the cookies above are what make this work.

 

2. Test the XPath expressions on their own: pick a representative page of the content you want to scrape, copy its HTML source, and test against that in the terminal.

For example:
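A sketch of the second approach, assuming the page source was saved locally as job.html (the filename is made up; the selectors are the ones used later in the spider):

# test selectors against a saved copy of a job page, without hitting the site
from scrapy import Selector

with open('job.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

print(sel.css('.job-name::attr(title)').extract_first())
print(sel.xpath('//*[@class="work_addr"]/a/text()').extract())

scrapy shell also accepts a local file path (scrapy shell ./job.html), which gives you the same response object to poke at.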

The second way works better.

Now for the main part:

1. scrapy startproject ArticleSpider (create the project)

2. (Prerequisite: cd into the project directory on the command line.) scrapy genspider -t crawl lagou www.lagou.com, or scrapy genspider lagou www.lagou.com

The former uses the crawl spider template.
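Roughly, the -t crawl template drops a CrawlSpider skeleton like the one below into spiders/lagou.py (the exact boilerplate varies a little between Scrapy versions; the Rule is just the placeholder the template generates):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['http://www.lagou.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item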

Directory structure:

Rough flow between the files (my own mental model):

lagou.py (start URLs and crawl rules) ------> items.py (process and clean the data) ------> settings.py (configuration) ------> pipelines.py (store the data)

3. Write the crawl rules: lagou.py

# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from items import LagouJobItem, LagouJobItemLoader
from utils.common import *
from datetime import datetime


class LagouSpider(CrawlSpider):
    name = 'lagou'                            # spider name, used to tell spiders apart
    allowed_domains = ['www.lagou.com']       # domain the spider is allowed to crawl
    start_urls = ['https://www.lagou.com/']   # start URL

    # per-spider settings go in custom_settings
    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'user_trace_token=20180809145407-031d6690-9ba1-11e8-b9cd-525400f775ce; LGUID=20180809145407-031d6aa7-9ba1-11e8-b9cd-525400f775ce; LG_LOGIN_USER_ID=1d2009da1adcd1e1b3df90ae486e0a6ed6f927d9d0ad0806; JSESSIONID=ABAAABAAADEAAFIAE5968C60EFF2D413ECDB2FF8F13ABCB; WEBTJ-ID=20180813160154-165324e448fcc9-071b34d843c314-102c1709-1049088-165324e4490cd2; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu; PRE_SITE=https%3A%2F%2Fwww.baidu%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xb29d323700012351%26issp%3D1%26f%3D3%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26rsv_sug3%3D1%26rsv_sug1%3D1%26rsv_sug7%3D100%26rsv_sug2%3D0%26prefixsug%3D%2525E6%25258B%252589%2525E5%25258B%2525BE%26rsp%3D0%26inputT%3D1498%26rsv_sug4%3D1498; PRE_LAND=https%3A%2F%2Fwww.lagou%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; _putrc=3D7BB7F7DC9382C5; login=true; unick=mttrtrehe; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; gate_login_token=221b6ccc9b991fc68d10499489e5db7131c283888aeea7d6; TG-TRACK-CODE=index_navigation; SEARCH_ID=f386beab95164a8e8e860aa82b4bd751; index_location_city=%E5%8C%97%E4%BA%AC; _gid=GA1.2.1333500619.1534062851; _ga=GA1.2.504828216.1533797648; LGSID=20180813160155-24f4dafe-9ecf-11e8-bb57-525400f775ce; LGRID=20180813160218-331abb23-9ecf-11e8-a37b-5254005c3644; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1533797648,1534062851,1534147148,1534147316; Hm_lpvt_4233e7bd0a3d81c4dff0ae56ccf756e6=1534147348',
            'Host': 'www.lagou.com',
            'Origin': '',
            'Referer': '/',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
        }
    }

    # Crawl rules.
    # LinkExtractor filters the links. With follow left at False, only links matching the rule
    # on the pages reached from start_urls get crawled; with follow=True the spider keeps
    # looking for matching URLs on every page it crawls, round and round, until the whole
    # site has been covered. Whether or not a Rule has a callback, it is handled by the same
    # _parse_response function, which simply checks for follow and callback.
    # Note: restrict_xpaths can be used to extract the next links only from a specific part of the page.
    rules = (
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j\d+.html',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    # parse a job page, i.e. pull out the fields we care about
    def parse_job(self, response):
        # title = response.css(".job-name::attr(title)")
        # print(title.extract())

        # Build an item loader from our own LagouJobItemLoader (a subclass of ItemLoader);
        # LagouJobItem is our custom class for holding and processing the scraped data.
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        # add_css / add_xpath / add_value: the first argument is the field name,
        # the second is where its value comes from
        item_loader.add_css('title', '.job-name::attr(title)')
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css('salary', '.salary::text')
        item_loader.add_css('job_city', '.job_request span:nth-child(2)::text')      # ['/北京 /']
        item_loader.add_css('work_years', '.job_request span:nth-child(3)::text')    # ['经验不限 /']
        item_loader.add_css('degree_need', '.job_request span:nth-child(4)::text')   # ['本科及以上 /']
        item_loader.add_css('job_type', '.job_request span:nth-child(5)::text')      # ['全职']
        item_loader.add_css('publish_time', '.publish_time::text')                   # ['3天前\xa0 发布于拉勾网']
        item_loader.add_css('tags', '.position-label.clearfix .labels::text')        # ['电商', 'Java', '软件开发']
        item_loader.add_css('job_advantage', '.job-advantage p::text')               # ['大型国企,快速成长,年轻团队,扁平化管理']
        item_loader.add_xpath('job_desc', '//*[@class="job_bt"]/div')                # ['负责本公司JAVA软件产品的设计、开发、实施和项目管理。']
        item_loader.add_xpath('job_addr', '//*[@class="work_addr"]/a/text()')        # ['北京', '大兴区', '马驹桥', '查看地图']
        item_loader.add_xpath('company_url', '//*[@class="c_feature"]/li[4 or 5]/a/@href')  # ['']
        item_loader.add_xpath('company_name', '//*[@class="job_company"]/dt/a/img/@alt')    # ['中金金融认证中心有限公司']
        item_loader.add_value('crawl_time', datetime.now())
        # print("url_xpath: ", response.xpath('//*[@class="c_feature"]/li[4]/a/@href'))
        # print("url_css: ", response.css('#job_company dt a::attr(href)'))

        # collect the loaded values into an item
        job_item = item_loader.load_item()
        # return the item so the item pipeline can process it
        return job_item
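parse_job calls get_md5 from utils.common, which isn't shown in these notes. A minimal sketch of what such a helper usually looks like (assumed, not copied from the project): it hashes the URL so url_object_id has a fixed length and can serve as the table's primary key.

import hashlib


def get_md5(url):
    # md5 works on bytes, so encode a str URL first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()  # 32-character hex digest, used as url_object_id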

4. items.py: defines the item model and cleans the data

It defines the model for the data we want from the site, mainly to make persisting it easier.

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT
import datetime
import re
from w3lib.html import remove_tags


# Our custom ItemLoader for processing items. The selectors in lagou.py all return
# lists, so by default take the first element -- the first one is the value we want.
class LagouJobItemLoader(ItemLoader):
    default_output_processor = TakeFirst()  # take the first element by default


def replace_splash(value):
    return value.replace("/", " ")  # ['/ 北京 /'] -> strip the surrounding " / "


def handle_jobaddr(value):
    addr_list = value.split("\n")
    addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"]
    return "".join(addr_list)


def handle_strip(value):
    return value.strip()  # strip whitespace


class LagouJobItem(scrapy.Item):  # the item model
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    title = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        # MapCompose passes job_city through the listed functions (there can be several,
        # referenced by name only); input_processor means the value is processed as it comes in.
        input_processor=MapCompose(replace_splash)
    )
    work_years = scrapy.Field(input_processor=MapCompose(replace_splash))
    degree_need = scrapy.Field(input_processor=MapCompose(replace_splash))
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    tags = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field(input_processor=MapCompose(handle_strip))
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_url = scrapy.Field()
    company_name = scrapy.Field(input_processor=MapCompose(handle_strip),)
    crawl_time = scrapy.Field()

    def get_insert_sql(self):  # builds the SQL statement for this item
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_url, company_name,
            tags, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE job_desc=VALUES(job_desc)
        """  # if the row already exists, update its job_desc instead of inserting
        # [Failure instance: Traceback: <class '_mysql_exceptions.ProgrammingError'>:
        #  not all arguments converted during string formatting]
        # -> the %s placeholders did not all match up with the params
        # ON DUPLICATE KEY UPDATE job_desc=VALUES(job_desc): update job_desc when the primary key already exists
        # job_id = extract_num(self["url"])
        params = (
            self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"],
            self["work_years"], self["degree_need"], self["job_type"], self["publish_time"],
            self["job_advantage"], self["job_desc"], self["job_addr"], self["company_url"],
            self["company_name"], self["tags"], self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params  # handed to the pipeline enabled in settings, which stores it asynchronously
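To make the processor behaviour concrete, here is a quick standalone check (not part of the project files) of what MapCompose and TakeFirst do with a scraped value:

from scrapy.loader.processors import MapCompose, TakeFirst


def replace_splash(value):
    return value.replace("/", " ")


# MapCompose runs every element of the input list through each function in turn
proc = MapCompose(replace_splash, str.strip)
print(proc(['/北京 /']))                 # ['北京']

# TakeFirst (the default output processor above) keeps only the first non-empty value
print(TakeFirst()(['北京', '大兴区']))   # 北京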

5. pipelines.py: stores the data. In settings only the asynchronous pipeline is enabled.

import codecs
import json

import MySQLdb
import MySQLdb.cursors
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi  # turns blocking MySQLdb operations into asynchronous ones


class MysqlTwistedPipeline(object):
    # write to MySQL asynchronously
    def __init__(self, dppool):
        self.dppool = dppool

    # standard hook for reading values out of the settings file
    @classmethod
    def from_settings(cls, settings):  # cls is this very class, MysqlTwistedPipeline
        dpparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=MySQLdb.cursors.DictCursor,  # cursor type; requires importing MySQLdb.cursors
            use_unicode=True,
        )  # the keyword names must match MySQLdb.connect, since they are passed straight through
        # Twisted does not ship a MySQL driver of its own -- MySQLdb still does the work;
        # adbapi just wraps the MySQLdb operations so they run asynchronously.
        dppool = adbapi.ConnectionPool("MySQLdb", **dpparms)  # which DB-API module to use, plus its connection args
        return cls(dppool)  # i.e. instantiate the pipeline

    def process_item(self, item, spider):
        # use twisted to turn the MySQL insert into an asynchronous operation:
        # tell the pool which method to run and which data to run it on
        query = self.dppool.runInteraction(self.do_insert, item)
        # AttributeError: 'Deferred' object has no attribute 'addErrorback'
        # query.addErrorback(self.handle_error)  # handle exceptions
        """
        Unhandled error in Deferred:
        2018-08-13 17:17:36 [twisted] CRITICAL: Unhandled error in Deferred:
        2018-08-13 17:17:36 [twisted] CRITICAL:
        Traceback (most recent call last):
          File "/home/maxinehehe/.virtualenvs/article_spider/lib/python3.5/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
            current.result = callback(current.result, *args, **kw)
        TypeError: handle_error() missing 2 required positional arguments: 'item' and 'spider'
        """
        # query.addErrback(self.handle_error)  # handle exceptions -- this is where the Lagou crawl used to fail
        query.addErrback(self.handle_error, item, spider)  # pass item and spider along so later errors can still be handled with context

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        """
        Everything else in this class can be treated as generic; for a different SQL
        operation only this method needs rewriting.
        :param cursor:
        :param item:
        :return:
        """
        # insert_sql = """insert into jobbole_article(title, url, create_date, fav_nums, url_object_id,
        #     front_image_url, front_image_path, comment_nums, praise_nums, tags, content)
        #     values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )"""
        # # use %s placeholders
        # cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"],
        #                             item["url_object_id"], item["front_image_url"],
        #                             item["front_image_path"], item["comment_nums"], item["praise_nums"],
        #                             item["tags"], item["content"]))

        # run the actual insert: each item builds its own SQL statement, which is executed against MySQL
        insert_sql, params = item.get_insert_sql()  # get the SQL statement and its parameters
        print(insert_sql, params)
        cursor.execute(insert_sql, params)  # execute the SQL
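One side note on from_settings: Scrapy also supports the from_crawler classmethod, which it checks first and which hands you the whole crawler (settings, signals, stats). If you ever need those, an equivalent hook can simply delegate to the code above; a sketch meant to sit inside the same class, not part of the original project:

class MysqlTwistedPipeline(object):  # same class as above, only showing the extra hook
    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings is the same Settings object that from_settings receives
        return cls.from_settings(crawler.settings)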

Finally, settings.py looks like this:

# -*- coding: utf-8 -*-
import os
import sys

BOT_NAME = 'ArticelSpider'

SPIDER_MODULES = ['ArticelSpider.spiders']
NEWSPIDER_MODULE = 'ArticelSpider.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # False, so URLs disallowed by robots.txt are not filtered out

DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
    'ArticelSpider.pipelines.MysqlTwistedPipeline': 3,  # store the data in the database asynchronously
}

# which item field holds the image URLs
IMAGES_URLS_FIELD = "front_image_url"  # note: scrapy treats front_image_url as a list, so the format has to match
project_dir = os.path.abspath(os.path.dirname(__file__))
# join the two path parts to locate the images directory
IMAGES_STORE = os.path.join(project_dir, 'images')  # prefer a relative path

# avoid import problems when running genspider
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(BASE_DIR, "ArticelSpider"))

AUTOTHROTTLE_ENABLED = True

SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
SQL_DATE_FORMAT = "%Y-%m-%d"

MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"  # database name, not the table name; the table is named in the SQL statement
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
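To actually run the crawl, scrapy crawl lagou from the project root is enough; for debugging in PyCharm a small entry script along these lines is handy (main.py is an assumed filename, not one of the files above):

# main.py -- run the spider from inside the IDE so breakpoints work
import os
import sys

from scrapy.cmdline import execute

# make the project importable no matter what the working directory is
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "lagou"])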

Scraped results in the database:
