Python百度图片批量下载器
Python百度图片批量下载器
环境
Python 3.7.4
urllib3==1.25.10
re
os
抓取页面
从百度图片
抓取想要的页面
def crawlPages(self):
    """Fetch self.number result pages from Baidu and return their decoded HTML.

    Returns:
        list[str]: one decoded HTML document per crawled page.
    """
    pages = []
    for i in range(self.number):
        # BUG FIX: the original expression '&pn=%d' % i * self.offset binds as
        # ('&pn=%d' % i) * self.offset, repeating the string offset times;
        # the intent is the paging parameter pn = i * offset.
        url = self.url + '&pn=%d' % (i * self.offset)
        request = urllib.request.Request(headers=self.headers, url=url)
        response = urllib.request.urlopen(request)
        pages.append(response.read().decode('utf-8'))
    return pages
提取图片URL
利用re
库提取图片的URL
,观察获取的页面,选择正则表达式为self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
def extractLinks(self, pages):
    """Return all non-empty picture URLs matched by self.pattern in pages.

    Args:
        pages: iterable of HTML documents (anything str() can render).
    Returns:
        list[str]: matched, non-empty link strings, in page order.
    """
    # re.findall never yields None here, so filtering on truthiness
    # (which drops only '') replaces the original two-part condition.
    return [link
            for page in pages
            for link in re.findall(self.pattern, str(page))
            if link]
下载图片
利用urllib.request.urlretrieve()
下载图片并保存到指定路径,需要from urllib import request
,由于版本问题,不能使用urllib.request
,由于用了request
作为变量名,所以导入包语句改为from urllib import request as req
def download(self, link, filename):
    """Download one picture from `link` into `filename`.

    Retries (recursively) when the transfer is truncated.

    Args:
        link: URL of the picture.
        filename: destination path, including the file name.
    """
    try:
        print("downloading...")
        req.urlretrieve(str(link), filename=filename)
        print("downloaded successfully")
    # BUG FIX: ContentTooShortError lives in urllib.error in Python 3;
    # urllib.ContentTooShortError raises AttributeError, so the original
    # retry path could never run.
    except urllib.error.ContentTooShortError:
        print("retrying...")
        self.download(link, filename)
实现代码
# -*- coding:utf-8 -*-
import re
import urllib
import urllib.error
from urllib import request as req
import os


class Downloader():
    """Batch downloader for Baidu image search results.

    Attributes:
        keyword: search keyword for the pictures to download
        path:    directory in which downloaded pictures are saved
        number:  number of result pages to crawl
    """

    def __init__(self, keyword, path, number):
        # NOTE(review): the original URL literal was truncated (it began
        # with '='); restored to the standard Baidu image search endpoint —
        # confirm against the live site.
        self.url = ('https://image.baidu.com/search/index'
                    '?tn=baiduimage&ie=utf-8&word=%s'
                    % urllib.parse.quote(keyword))
        # Captures the "hoverURL" field of each search result entry.
        self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
        self.path = path
        self.createDirectory()
        self.number = number
        # Baidu pages results 20 at a time; 'pn' advances in steps of 20.
        self.offset = 20

    def createDirectory(self):
        """Create the save directory (including parents) if it does not exist."""
        if not os.path.exists(self.path):
            # makedirs (vs mkdir) also creates missing parent directories.
            os.makedirs(self.path)

    def crawlPages(self):
        """Fetch self.number result pages and return their decoded HTML.

        Returns:
            list[str]: one decoded HTML document per crawled page.
        """
        pages = []
        for i in range(self.number):
            # BUG FIX: '&pn=%d' % i * self.offset repeated the formatted
            # string offset times; the intent is pn = i * offset.
            url = self.url + '&pn=%d' % (i * self.offset)
            request = req.Request(headers=self.headers, url=url)
            response = req.urlopen(request)
            pages.append(response.read().decode('utf-8'))
        return pages

    def extractLinks(self, pages):
        """Return all non-empty picture URLs matched by self.pattern in pages.

        Args:
            pages: iterable of HTML documents.
        Returns:
            list[str]: matched, non-empty links, in page order.
        """
        # findall never yields None; truthiness filtering drops only ''.
        return [link
                for page in pages
                for link in re.findall(self.pattern, str(page))
                if link]

    def download(self, link, filename):
        """Download one picture to `filename`, retrying on truncated transfers."""
        try:
            print("downloading...")
            req.urlretrieve(str(link), filename=filename)
            print("downloaded successfully")
        # BUG FIX: ContentTooShortError is in urllib.error in Python 3;
        # urllib.ContentTooShortError raises AttributeError.
        except urllib.error.ContentTooShortError:
            print("retrying...")
            self.download(link, filename)

    def downloadAll(self, linkList):
        """Download every link, naming files 0.jpg, 1.jpg, ... in order.

        Args:
            linkList: list of picture URLs.
        """
        # BUG FIX: the original used linkList.index(link), which is O(n^2)
        # and reuses the first index when the same link appears twice.
        for i, link in enumerate(linkList):
            self.download(link, self.path + '/%d.jpg' % i)

    def startDownload(self):
        """Crawl the result pages, extract the links, download everything."""
        pages = self.crawlPages()
        linkList = self.extractLinks(pages)
        self.downloadAll(linkList)


if __name__ == "__main__":
    keyword = input("Please input the keyword of picture you wanna download:")
    path = './' + input("Please input the path of pictures you wanna save:")
    number = int(input("Please input the number of page you wanna get:"))
    downloader = Downloader(keyword, path, number)
    downloader.startDownload()
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!
Python百度图片批量下载器
Python百度图片批量下载器
环境
Python 3.7.4
urllib3==1.25.10
re
os
抓取页面
从百度图片
抓取想要的页面
def crawlPages(self):
    """Fetch self.number result pages from Baidu and return their decoded HTML.

    Returns:
        list[str]: one decoded HTML document per crawled page.
    """
    pages = []
    for i in range(self.number):
        # BUG FIX: the original expression '&pn=%d' % i * self.offset binds as
        # ('&pn=%d' % i) * self.offset, repeating the string offset times;
        # the intent is the paging parameter pn = i * offset.
        url = self.url + '&pn=%d' % (i * self.offset)
        request = urllib.request.Request(headers=self.headers, url=url)
        response = urllib.request.urlopen(request)
        pages.append(response.read().decode('utf-8'))
    return pages
提取图片URL
利用re
库提取图片的URL
,观察获取的页面,选择正则表达式为self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
def extractLinks(self, pages):
    """Return all non-empty picture URLs matched by self.pattern in pages.

    Args:
        pages: iterable of HTML documents (anything str() can render).
    Returns:
        list[str]: matched, non-empty link strings, in page order.
    """
    # re.findall never yields None here, so filtering on truthiness
    # (which drops only '') replaces the original two-part condition.
    return [link
            for page in pages
            for link in re.findall(self.pattern, str(page))
            if link]
下载图片
利用urllib.request.urlretrieve()
下载图片并保存到指定路径,需要from urllib import request
,由于版本问题,不能使用urllib.request
,由于用了request
作为变量名,所以导入包语句改为from urllib import request as req
def download(self, link, filename):
    """Download one picture from `link` into `filename`.

    Retries (recursively) when the transfer is truncated.

    Args:
        link: URL of the picture.
        filename: destination path, including the file name.
    """
    try:
        print("downloading...")
        req.urlretrieve(str(link), filename=filename)
        print("downloaded successfully")
    # BUG FIX: ContentTooShortError lives in urllib.error in Python 3;
    # urllib.ContentTooShortError raises AttributeError, so the original
    # retry path could never run.
    except urllib.error.ContentTooShortError:
        print("retrying...")
        self.download(link, filename)
实现代码
# -*- coding:utf-8 -*-
import re
import urllib
import urllib.error
from urllib import request as req
import os


class Downloader():
    """Batch downloader for Baidu image search results.

    Attributes:
        keyword: search keyword for the pictures to download
        path:    directory in which downloaded pictures are saved
        number:  number of result pages to crawl
    """

    def __init__(self, keyword, path, number):
        # NOTE(review): the original URL literal was truncated (it began
        # with '='); restored to the standard Baidu image search endpoint —
        # confirm against the live site.
        self.url = ('https://image.baidu.com/search/index'
                    '?tn=baiduimage&ie=utf-8&word=%s'
                    % urllib.parse.quote(keyword))
        # Captures the "hoverURL" field of each search result entry.
        self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
        self.path = path
        self.createDirectory()
        self.number = number
        # Baidu pages results 20 at a time; 'pn' advances in steps of 20.
        self.offset = 20

    def createDirectory(self):
        """Create the save directory (including parents) if it does not exist."""
        if not os.path.exists(self.path):
            # makedirs (vs mkdir) also creates missing parent directories.
            os.makedirs(self.path)

    def crawlPages(self):
        """Fetch self.number result pages and return their decoded HTML.

        Returns:
            list[str]: one decoded HTML document per crawled page.
        """
        pages = []
        for i in range(self.number):
            # BUG FIX: '&pn=%d' % i * self.offset repeated the formatted
            # string offset times; the intent is pn = i * offset.
            url = self.url + '&pn=%d' % (i * self.offset)
            request = req.Request(headers=self.headers, url=url)
            response = req.urlopen(request)
            pages.append(response.read().decode('utf-8'))
        return pages

    def extractLinks(self, pages):
        """Return all non-empty picture URLs matched by self.pattern in pages.

        Args:
            pages: iterable of HTML documents.
        Returns:
            list[str]: matched, non-empty links, in page order.
        """
        # findall never yields None; truthiness filtering drops only ''.
        return [link
                for page in pages
                for link in re.findall(self.pattern, str(page))
                if link]

    def download(self, link, filename):
        """Download one picture to `filename`, retrying on truncated transfers."""
        try:
            print("downloading...")
            req.urlretrieve(str(link), filename=filename)
            print("downloaded successfully")
        # BUG FIX: ContentTooShortError is in urllib.error in Python 3;
        # urllib.ContentTooShortError raises AttributeError.
        except urllib.error.ContentTooShortError:
            print("retrying...")
            self.download(link, filename)

    def downloadAll(self, linkList):
        """Download every link, naming files 0.jpg, 1.jpg, ... in order.

        Args:
            linkList: list of picture URLs.
        """
        # BUG FIX: the original used linkList.index(link), which is O(n^2)
        # and reuses the first index when the same link appears twice.
        for i, link in enumerate(linkList):
            self.download(link, self.path + '/%d.jpg' % i)

    def startDownload(self):
        """Crawl the result pages, extract the links, download everything."""
        pages = self.crawlPages()
        linkList = self.extractLinks(pages)
        self.downloadAll(linkList)


if __name__ == "__main__":
    keyword = input("Please input the keyword of picture you wanna download:")
    path = './' + input("Please input the path of pictures you wanna save:")
    number = int(input("Please input the number of page you wanna get:"))
    downloader = Downloader(keyword, path, number)
    downloader.startDownload()
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!
发布评论