# __author: han-zhang
# date: 2019/8/15 17:15
from bs4 import BeautifulSoup
import urllib.request, time
def get_request(url):
    """Build a urllib Request for *url* that carries a desktop-Chrome User-Agent.

    The UA header makes the scraper look like a normal browser so the site
    does not reject the request outright.
    """
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
          ' (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36')
    return urllib.request.Request(url=url, headers={'User-Agent': ua})
def get_content(request):
    """Fetch the page behind *request* and return it decoded as GBK text.

    Fix: the original called urlopen() without ever closing the response,
    leaking the HTTP connection if read()/decode() raised. A context
    manager guarantees the response is closed on every path.

    NOTE(review): the 'gbk' decode assumes 51job still serves GBK-encoded
    pages — confirm against the live Content-Type header.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('gbk')
def parse_content(content):
    """Parse the result-page HTML and hand the listing container to get_text.

    Locates the div.dw_table element that holds all job rows.
    """
    listing = BeautifulSoup(content, 'lxml').find('div', class_='dw_table')
    # print(listing)
    get_text(listing)
def get_text(bdiv):
    """Extract one job row per div.el from *bdiv* and append them to 51job.txt.

    bdiv: bs4 Tag for the div.dw_table listing container. Each child
    div.el holds the <span>s of one job posting; the first div.el is the
    column-header row and is skipped.

    Fixes vs. original:
    - file handle opened with ``with`` so it is closed even on error
      (the original only closed it on the success path);
    - header row skipped by slicing instead of ``remove(bb_list[0])``,
      which removes by equality and is fragile with bs4 tags;
    - manual counter replaced with ``enumerate``;
    - ``.lstrip().rstrip()`` collapsed to ``.strip()`` (identical result).
    """
    rows = bdiv.find_all('div', class_='el')
    with open('51job.txt', 'a', encoding='utf8') as fp:
        # rows[0] is the table header, not a job posting.
        for count, row in enumerate(rows[1:], start=1):
            print('正在下载第%s条' % count)
            spans = row.find_all('span')
            # Job title: collapse newlines and surrounding whitespace.
            job_name = spans[0].text.replace('\n', '').strip()
            # Company name.
            bname = spans[1].text
            # Location, salary and posting date sit in the last three spans.
            job_where = spans[-3].string
            salary = spans[-2].string
            times = spans[-1].string
            fp.write(str(job_name) + '\t' + str(bname) + '\t' + str(job_where)
                     + '\t' + str(salary) + '\t' + str(times) + '\n')
            print('结束下载第%s条' % count)
            # Throttle: be polite to the server between rows.
            time.sleep(2)
def main():
    """Prompt for a page count, then scrape that many 51job result pages.

    For each page: build a browser-like request, download and decode the
    HTML, then parse it and append the rows to 51job.txt.
    """
    pages = int(input('请输入你要取爬的页数:'))
    for page_no in range(1, pages + 1):
        url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,python,2,%s.html?' % page_no
        req = get_request(url)      # request object with UA header
        html = get_content(req)     # raw page text
        parse_content(html)         # parse with bs4 and dump rows


if __name__ == '__main__':
    main()