Scraping Douban's Algorithm-Book Rankings with a Python Crawler and Exporting Them to an XLS File
Environment
Python 3.7.4
requests==2.22.0
bs4==0.0.1 (a thin wrapper package that installs beautifulsoup4)
xlwt==1.3.0
urllib3==1.24.2
re (Python standard library)
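The third-party packages can be installed with pip at the pinned versions listed above (re ships with Python and needs no installation):

pip install requests==2.22.0 bs4==0.0.1 xlwt==1.3.0 urllib3==1.24.2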
Initialization
def __init__(self):
    # URL prefix and suffix for Douban's "算法" (algorithms) book tag
    # (host assumed to be book.douban.com)
    self.urlPrefix = 'https://book.douban.com/tag/%E7%AE%97%E6%B3%95?start='
    self.urlSuffix = '&type=T'
    # Browser-like headers so the request is not rejected as a bot
    self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
    self.books = []
    # Pre-compiled regular expressions
    self.detailURLPattern = re.compile(r'<a href="(.*?)".*>')
    self.titlePattern = re.compile(r'<a href=.*title="(.*?)">')
    self.imagePattern = re.compile(r'<img class="" src="(.*?)" .*>')
    self.publisherPattern = re.compile(r'<div class="pub">(.*?)</div>', re.S)
    self.ratingPattern = re.compile(r'<span class="rating_nums">(.*?)</span>')
    self.evaluatorsPattern = re.compile(r'<span class="pl">.*?(\d*?)人评价.*?</span>', re.S)
    self.introductionPattern = re.compile(r'<p>(.*?)</p>', re.S)
    self.purchaseLinkPattern = re.compile(r'<a href="(.*?)".*?</a>', re.S)
    self.pagesPattern = re.compile(r'<a href="/tag/.*?&type=T">(\d*?)</a>')
    # Workbook objects for the XLS export
    self.workbook = xlwt.Workbook(encoding='utf-8')
    self.sheet = self.workbook.add_sheet('Books')
    self.xlsPath = './Books.xls'
Fetching a Page
def crawlPage(self, url):
    # Build the request with the browser-like headers
    request = urllib.request.Request(headers=self.headers, url=url)
    page = None
    try:
        # Fetch the HTML page and decode it
        response = urllib.request.urlopen(request)
        page = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # Error handling
        print("Failed to fetch page!")
        print(e)
    return page
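Incidentally, requests appears in the environment list even though the code above fetches pages with urllib. A minimal sketch of the equivalent fetch using requests (the method name and self.headers are carried over from the class above; the timeout value is my own choice, not from the original):

import requests  # would normally sit with the other imports

def crawlPage(self, url):
    # Same job as the urllib version above, done with requests; a sketch,
    # not the original implementation
    try:
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()   # treat HTTP error codes as failures
        response.encoding = 'utf-8'   # Douban serves UTF-8 pages
        return response.text
    except requests.RequestException as e:
        print("Failed to fetch page!")
        print(e)
        return None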
Getting the Total Page Count
def getTotalPages(self):
    # Build the URL of the first result page
    url = self.urlPrefix + str(0) + self.urlSuffix
    # Fetch the page
    page = self.crawlPage(url)
    if page is None:
        return 0  # nothing to parse if the download failed
    # Parse with the built-in HTML parser
    beautifulSoup = BeautifulSoup(page, 'html.parser')
    pageNumbers = []
    for subject in beautifulSoup.find_all('div', class_='paginator'):
        subject = str(subject)
        # Pull the page numbers out of the paginator with a regex
        pageNumbers = re.findall(self.pagesPattern, subject)
    totalPageNumber = 0
    # Return the largest page number found
    for pageNumber in pageNumbers:
        totalPageNumber = max(totalPageNumber, int(pageNumber))
    return totalPageNumber
Extracting the Data
def extractData(self):
    totalPages = self.getTotalPages()
    for i in range(0, totalPages):
        # Build the URL (20 books per page)
        url = self.urlPrefix + str(i * 20) + self.urlSuffix
        # Fetch the page
        page = self.crawlPage(url)
        if page is None:
            continue  # skip pages that failed to download
        # Parse with the built-in HTML parser
        beautifulSoup = BeautifulSoup(page, 'html.parser')
        for subject in beautifulSoup.find_all('li', class_='subject-item'):
            subject = str(subject)
            # Extract each field with the pre-compiled regexes
            book = []
            title = re.findall(self.titlePattern, subject)
            if len(title) > 0:
                title = title[0]
            else:
                title = 'Nothing'
            book.append(title)
            detailURL = re.findall(self.detailURLPattern, subject)[0]
            book.append(detailURL)
            imageURL = re.findall(self.imagePattern, subject)[0]
            book.append(imageURL)
            publisher = str(re.findall(self.publisherPattern, subject)[0]).replace(' ', '').replace('\n', '')
            book.append(publisher)
            rating = re.findall(self.ratingPattern, subject)
            if len(rating) > 0:
                rating = rating[0]
            else:
                rating = 'None'
            book.append(rating)
            evaluators = re.findall(self.evaluatorsPattern, subject)[0]
            book.append(evaluators)
            introduction = re.findall(self.introductionPattern, subject)
            if len(introduction) > 0:
                introduction = introduction[0]
            else:
                introduction = 'Nothing'
            book.append(introduction)
            purchaseLink = re.findall(self.purchaseLinkPattern, subject)
            if len(purchaseLink) > 1:
                purchaseLink = purchaseLink[1]
            else:
                purchaseLink = 'Nothing'
            book.append(purchaseLink)
            # Append the book's record to the list
            self.books.append(book)
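Extracting fields from HTML with regular expressions is brittle: whenever Douban tweaks its markup, the patterns silently stop matching. Since BeautifulSoup already parses each page, the fields could also be read straight from the parse tree. A minimal sketch for three of the fields, assuming the class names the regexes above rely on (rating_nums, pub) and that the title link carries a title attribute:

for item in beautifulSoup.find_all('li', class_='subject-item'):
    # Parse-tree lookups instead of regexes over str(item)
    link = item.find('a', title=True)
    title = link['title'] if link else 'Nothing'
    rating_tag = item.find('span', class_='rating_nums')
    rating = rating_tag.get_text(strip=True) if rating_tag else 'None'
    pub_tag = item.find('div', class_='pub')
    publisher = pub_tag.get_text(strip=True) if pub_tag else 'Nothing'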
Exporting to an XLS File
def exportXLS(self):
    if len(self.books) == 0:
        # No data yet; remind the caller and return
        print("Get data first")
        return
    # Column headers
    columns = ['title', 'detailURL', 'imageURL', 'publisher', 'rating', 'evaluators', 'introduction', 'purchaseLink']
    # Write the header row
    for column in range(0, len(columns)):
        self.sheet.write(0, column, columns[column])
    # Write one row per book
    for i in range(1, len(self.books) + 1):
        for column in range(0, len(columns)):
            self.sheet.write(i, column, self.books[i - 1][column])
    # Save the workbook as an XLS file
    self.workbook.save(self.xlsPath)
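Note that xlwt writes the legacy XLS format, which caps a worksheet at 65,536 rows. That is more than enough for one tag's results, but a much larger crawl would need the XLSX format via openpyxl or xlsxwriter.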
Full Implementation
# -*- coding:utf-8 -*-
import re
from bs4 import BeautifulSoup
import xlwt
import urllib.request
import urllib.error
class Spider():
    def __init__(self):
        # Douban book-tag page for "算法" (host assumed: book.douban.com)
        self.urlPrefix = 'https://book.douban.com/tag/%E7%AE%97%E6%B3%95?start='
        self.urlSuffix = '&type=T'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
        self.books = []
        self.detailURLPattern = re.compile(r'<a href="(.*?)".*>')
        self.titlePattern = re.compile(r'<a href=.*title="(.*?)">')
        self.imagePattern = re.compile(r'<img class="" src="(.*?)" .*>')
        self.publisherPattern = re.compile(r'<div class="pub">(.*?)</div>', re.S)
        self.ratingPattern = re.compile(r'<span class="rating_nums">(.*?)</span>')
        self.evaluatorsPattern = re.compile(r'<span class="pl">.*?(\d*?)人评价.*?</span>', re.S)
        self.introductionPattern = re.compile(r'<p>(.*?)</p>', re.S)
        self.purchaseLinkPattern = re.compile(r'<a href="(.*?)".*?</a>', re.S)
        self.pagesPattern = re.compile(r'<a href="/tag/.*?&type=T">(\d*?)</a>')
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('Books')
        self.xlsPath = './Books.xls'

    def crawlPage(self, url):
        request = urllib.request.Request(headers=self.headers, url=url)
        page = None
        try:
            response = urllib.request.urlopen(request)
            page = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            print("Failed to fetch page!")
            print(e)
        return page

    def extractData(self):
        totalPages = self.getTotalPages()
        for i in range(0, totalPages):
            url = self.urlPrefix + str(i * 20) + self.urlSuffix
            page = self.crawlPage(url)
            if page is None:
                continue  # skip pages that failed to download
            beautifulSoup = BeautifulSoup(page, 'html.parser')
            for subject in beautifulSoup.find_all('li', class_='subject-item'):
                subject = str(subject)
                book = []
                title = re.findall(self.titlePattern, subject)
                if len(title) > 0:
                    title = title[0]
                else:
                    title = 'Nothing'
                book.append(title)
                detailURL = re.findall(self.detailURLPattern, subject)[0]
                book.append(detailURL)
                imageURL = re.findall(self.imagePattern, subject)[0]
                book.append(imageURL)
                publisher = str(re.findall(self.publisherPattern, subject)[0]).replace(' ', '').replace('\n', '')
                book.append(publisher)
                rating = re.findall(self.ratingPattern, subject)
                if len(rating) > 0:
                    rating = rating[0]
                else:
                    rating = 'None'
                book.append(rating)
                evaluators = re.findall(self.evaluatorsPattern, subject)[0]
                book.append(evaluators)
                introduction = re.findall(self.introductionPattern, subject)
                if len(introduction) > 0:
                    introduction = introduction[0]
                else:
                    introduction = 'Nothing'
                book.append(introduction)
                purchaseLink = re.findall(self.purchaseLinkPattern, subject)
                if len(purchaseLink) > 1:
                    purchaseLink = purchaseLink[1]
                else:
                    purchaseLink = 'Nothing'
                book.append(purchaseLink)
                self.books.append(book)

    def getTotalPages(self):
        url = self.urlPrefix + str(0) + self.urlSuffix
        page = self.crawlPage(url)
        if page is None:
            return 0  # nothing to parse if the download failed
        beautifulSoup = BeautifulSoup(page, 'html.parser')
        pageNumbers = []
        for subject in beautifulSoup.find_all('div', class_='paginator'):
            subject = str(subject)
            pageNumbers = re.findall(self.pagesPattern, subject)
        totalPageNumber = 0
        for pageNumber in pageNumbers:
            totalPageNumber = max(totalPageNumber, int(pageNumber))
        return totalPageNumber

    def exportXLS(self):
        if len(self.books) == 0:
            print("Get data first")
            return
        columns = ['title', 'detailURL', 'imageURL', 'publisher', 'rating', 'evaluators', 'introduction', 'purchaseLink']
        for column in range(0, len(columns)):
            self.sheet.write(0, column, columns[column])
        for i in range(1, len(self.books) + 1):
            for column in range(0, len(columns)):
                self.sheet.write(i, column, self.books[i - 1][column])
        self.workbook.save(self.xlsPath)


if __name__ == "__main__":
    spider = Spider()
    spider.extractData()
    spider.exportXLS()
    # spider.getTotalPages()
Output
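Running the script writes Books.xls to the working directory. The 'Books' sheet holds a header row followed by one row per scraped book, with the columns

title, detailURL, imageURL, publisher, rating, evaluators, introduction, purchaseLink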
Final Notes
- Since the author's knowledge is limited, omissions are hard to avoid; readers are welcome to point out any mistakes at any time, so that no unnecessary misunderstandings arise!