Scraping 58同城 (58.com) second-hand housing listings with Python

First, inspect the page elements with the browser's developer tools, look at the page HTML, and pick out the fields to extract: title, baseinfo, and price.
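For orientation, here is a minimal, runnable sketch of the list markup the selectors below assume. The HTML fragment and its values are invented for illustration; only the class names (house-list-wrap, list-info, title, baseinfo, price, sum, unit) come from the scraper's actual selectors:

from bs4 import BeautifulSoup

# Invented fragment mirroring the structure the scraper expects
sample = '''
<ul class="house-list-wrap">
  <li>
    <div class="list-info">
      <h2 class="title"><a>示例标题 2室1厅</a></h2>
      <p class="baseinfo"><span>2室1厅1卫</span><span>89㎡</span></p>
      <p class="baseinfo"><span><a>某小区</a><a>某区</a><a>某街道</a></span></p>
    </div>
    <div class="price"><p class="sum"><b>120</b>万</p><p class="unit">13483元/㎡</p></div>
  </li>
</ul>
'''
soup = BeautifulSoup(sample, 'lxml')
li = soup.find('ul', {'class': 'house-list-wrap'}).find('li')
print(li.find(class_='title').get_text(strip=True))                     # 示例标题 2室1厅
print(li.find(class_='price').find(class_='sum').get_text(strip=True))  # 120万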
# -*- coding:utf-8 -*-
import csv

import requests
from bs4 import BeautifulSoup

s = requests.Session()

def getdata(soup, f_csv):
    """Extract every listing on one list page and write the rows to the CSV."""
    ul = soup.find('ul', {'class': 'house-list-wrap'})
    for li in ul.find_all('li'):
        data = []
        info = li.find(class_='list-info')
        # Title: drop commas and line breaks so the field stays on one line
        title = info.find(class_='title').get_text(strip=True)
        data.append(title.replace(',', '').replace('\n', '').replace('\r', ''))
        baseinfo = info.find_all(class_='baseinfo')
        # First baseinfo block: per-listing attributes (rooms, area, ...)
        for text in baseinfo[0].stripped_strings:
            data.append(text.replace(' ', ''))
        # Second baseinfo block: estate / location links (小区信息)
        axs = baseinfo[1].span.find_all(name='a')
        if len(axs) == 3:
            for a in axs:
                data.append(a.get_text(strip=True).replace(' ', ''))
        else:
            # Fewer than three links: join whatever text is there and pad
            # with 'Na' so every row keeps the same number of columns
            ax = baseinfo[1].span.get_text('|', strip=True)
            data.append(ax.replace('\n', '').replace(' ', '').replace('\r', ''))
            data.append('Na')
            data.append('Na')
        price = li.find(class_='price')
        data.append(price.find(class_='sum').get_text(strip=True))
        data.append(price.find(class_='unit').get_text(strip=True))
        f_csv.writerow([d for d in data if d])  # drop any empty fields

# 'utf-8-sig' writes the BOM once at the start of the file (the old code
# appended a BOM before every page); newline='' is what the csv module expects
with open('file_.csv', 'w', newline='', encoding='utf-8-sig') as f:
    f_csv = csv.writer(f, delimiter=',', quoting=csv.QUOTE_ALL)
    for i in range(1, 70):
        url = '' + str(i) + '/'  # base list URL was omitted in the original post
        r = s.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        getdata(soup, f_csv)
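One thing the loop above leaves out: the requests go out back-to-back with requests' default headers, which 58同城 may throttle or block. A minimal hardening sketch, reusing the same session s (the User-Agent string, retry count, and delay are placeholder values, not anything 58同城 is known to require):

import time

# Example User-Agent; any realistic browser string can go here
s.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})

def fetch(url, retries=3, delay=2):
    # Retry with a pause so throttling or transient errors don't kill the run
    for _ in range(retries):
        r = s.get(url, timeout=10)
        if r.status_code == 200:
            return r
        time.sleep(delay)
    r.raise_for_status()

In the page loop, r = fetch(url) would then stand in for r = s.get(url).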
Run result: (the output screenshot from the original post is not reproduced here)

This is my first scraper and plenty of it is still rough; pointers from more experienced people are welcome...