Scraping Lagou job listings with Python requests + BeautifulSoup4


import requests
from bs4 import BeautifulSoup

url="https://www.lagou.com/zhaopin/Java/?labelWords=label"


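# Request headers: a desktop-browser User-Agent so the request looks like a normal browser visit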
headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }


# Backslashes in a Windows path must be escaped as \\
# encoding="utf-8" is added so the Chinese text can be written without encoding errors
f=open("C:\\Users\\mym\\Desktop\\jz\\3班\\20201019\\招聘信息.txt","w",encoding="utf-8")

def get(url):

    # Send a GET request: impersonate a browser, send the request to the server,
    # and receive the response (HTML, CSS, JS, JSON)
    resp = requests.get(url,headers=headers)

    soup=BeautifulSoup(resp.text,'html.parser')
    div_list=soup.select('div[class="list_item_top"]')

    print("Number of job items (div.list_item_top):",len(div_list))


    for item in div_list:
        title=item.select("h3")[0].text
        # money=item.select("span[class='money']")[0].text
        money_edu=item.select("div[class='li_b_l']")[0].text.strip()
        # Split on the newline character \n (write \n here, not \\n)
        array=money_edu.split("\n")
        edu_array=array[1].split("/")
        # print(title,",salary:",array[0],",experience:",edu_array[0],",education:",edu_array[1])
        line=title+","+array[0]+","+edu_array[0]+","+edu_array[1]+"\n"
        f.write(line)

    # Find the URL of the next page
    next_pages=soup.select("a[class='page_no']")
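    # The last a.page_no link is the "next page" button; when it is absent (last results page), the recursion stops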
    if next_pages:
        next_page_url=next_pages[-1].get("href")
        print("next_page_url:",next_page_url)
        get(next_page_url)

get(url)
# Close the output file once all pages have been scraped
f.close()
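To see what the CSS selectors and the string splitting do without sending a request to the live site, here is a minimal self-contained sketch. The HTML snippet is invented for illustration and only mirrors the structure the script assumes: a div.list_item_top containing an h3 title and a div.li_b_l with the salary on the first line and experience/education on the second.

from bs4 import BeautifulSoup

# Invented sample that mimics the structure the script expects
sample_html = """<div class="list_item_top">
<h3>Java开发工程师</h3>
<div class="li_b_l">15k-25k
经验3-5年/本科</div>
</div>"""

soup = BeautifulSoup(sample_html, 'html.parser')
item = soup.select('div[class="list_item_top"]')[0]
title = item.select("h3")[0].text                                # "Java开发工程师"
money_edu = item.select("div[class='li_b_l']")[0].text.strip()   # "15k-25k\n经验3-5年/本科"
array = money_edu.split("\n")                                    # ["15k-25k", "经验3-5年/本科"]
edu_array = array[1].split("/")                                  # ["经验3-5年", "本科"]
print(title + "," + array[0] + "," + edu_array[0] + "," + edu_array[1])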

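The recursion in get() terminates on its own: on the last results page there is no a.page_no link, so select() returns an empty list and the if branch is skipped. A small sketch of that termination check, using invented snippets (the href value is hypothetical):

from bs4 import BeautifulSoup

# One invented page fragment with a "next page" link, one without
with_next = '<a class="page_no" href="https://www.lagou.com/zhaopin/Java/2/">下一页</a>'
last_page = '<div class="list_item_top"></div>'

for html in (with_next, last_page):
    next_pages = BeautifulSoup(html, 'html.parser').select("a[class='page_no']")
    if next_pages:
        print("recurse into:", next_pages[-1].get("href"))
    else:
        print("no page_no link -> recursion stops")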
Original source: https://malaoshi.top/show_1EF6WguAQ8DZ.html