Why can't I get results from this website using web scraping?

Problem description · Votes: -1 · Answers: 1

I'm using Python 3.6.5, and my operating system is macOS 10.13.6.

I'm learning web scraping, and I want to get data from this website (https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=).

Here is my code:

# encoding: utf-8

import requests
from lxml import etree

def parse_list_page():
    url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host':'www.lagou.com',
        'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code':'0',
        'X-Anit-Forge-Token':None,
        'X-Requested-With':'XMLHttpRequest',
    }
    data = {
        'first':'false',
        'pn':1,
        'kd':'python',
    }
    response = requests.post(url,headers=headers,data=data)
    print(response.json())

def main():
    parse_list_page()

if __name__ == '__main__':
    main()
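
To see what the server actually sends back, a minimal check like the one below can help; it only prints the status code and the raw body using standard requests attributes, and assumes nothing about lagou's response format:

# Debugging sketch: inspect the raw reply instead of calling .json() directly,
# so a blocked or non-JSON response is still visible.
import requests

def inspect_response(url, headers, data):
    response = requests.post(url, headers=headers, data=data)
    print(response.status_code)                      # HTTP status of the reply
    print(response.headers.get('Content-Type'))      # what the server says it returned
    print(response.text[:500])                       # first 500 characters of the raw body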

Thank you for taking the time to answer my question.

python-3.x web-scraping python-requests
1 Answer
0 votes

I figured out the answer; the working code is below:

# encoding: utf-8

import requests
from lxml import etree
import time

def parse_list_page():
    url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3'


    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
        'Host':'www.lagou.com',
        'Referer':'https://www.lagou.com/',
        'Connection':'keep-alive',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
        'Upgrade-Insecure-Requests':'1',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cache-Control':'no-cache',
        'Pragma':'no-cache',
    }

    # First request: load the listing page; the cookies lagou sets here are required by the JSON endpoint.
    response = requests.get(url,headers=headers)
    # print(response.text)
    # Save the cookies from the first response as a plain dict.
    r = requests.utils.dict_from_cookiejar(response.cookies)
    print(r)
    print('='*30)
    # r['LGUID'] = r['LGRID']
    # r['user_trace_token'] = r['LGRID']
    # r['LGSID'] = r['LGRID']

    cookies = {
        # 'X_MIDDLE_TOKEN':'df7c1d3cfdf279f0caf13df990723620',
        # 'JSESSIONID':'ABAAABAAAIAACBI29FE9BDFB6838D8DD69C580E517292C9',
        # '_ga':'GA1.2.820168368.1551196380',
        # '_gat':'1',
        # 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1551196381',
        # 'user_trace_token':'20190226235303-99bc357a-39de-11e9-921f-525400f775ce',
        # 'LGSID':'20190311094827-c3bc2393-439f-11e9-a15a-525400f775ce',
        # 'PRE_UTM':'',
        # 'PRE_HOST':'',
        # 'PRE_SITE':'',
        # 'PRE_LAND':'https%3A%2F%2Fwww.lagou.com%2F',
        # 'LGUID':'20190226235303-99bc3944-39de-11e9-921f-525400f775ce',
        # '_gid':'GA1.2.1391680888.1552248111',
        # 'index_location_city':'%E6%B7%B1%E5%9C%B3',
        # 'TG-TRACK-CODE':'index_search',
        # 'LGRID':'20190311100452-0ed0525c-43a2-11e9-9113-5254005c3644',
        # 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1552269893',
        # 'SEARCH_ID':'aae3c38ec76545fc86cd4e23153afe44',

    }
    cookies.update(r)

    print(r)
    print('=' * 30)
    print(cookies)
    print('=' * 30)

    headers = {
        'Origin':'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3',
        'Connection': 'keep-alive',
    }

    params = {
        'px':'default',
        'city':'深圳',
        'needAddtionalResult':'false'
    }

    data = {
        'first':'true',
        'pn':1,
        'kd':'python',
    }
    url_json = 'https://www.lagou.com/jobs/positionAjax.json'
    # Second request: call the JSON endpoint, sending back the cookies saved from the first request.
    response = requests.post(url=url_json,headers=headers,params=params,cookies=cookies,data=data)
    print(response.json())


def main():
    parse_list_page()


if __name__ == '__main__':
    main()

The reason I couldn't get JSON back is the site's anti-scraping rule: the cookies set by the first request have to be sent along with the follow-up request. So you first request the listing page, save the cookies from that response, and then include (and update) them when you request the JSON endpoint for the job data. I hope this helps if you run into the same problem while web scraping.
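
For reference, the same two-step flow can be written more compactly with requests.Session, which stores the cookies from the first request and resends them automatically. This is only a sketch based on the headers and parameters used above, not something verified against the current site:

# encoding: utf-8
# Sketch: GET the listing page first so the session picks up lagou's cookies,
# then POST to the JSON endpoint with the same session.
import requests

def parse_list_page_with_session():
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    })

    # First request: the listing page; the session keeps whatever cookies are set here.
    list_url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3'
    session.get(list_url, headers={'Referer': 'https://www.lagou.com/'})

    # Second request: the JSON endpoint, reusing the stored cookies.
    json_url = 'https://www.lagou.com/jobs/positionAjax.json'
    headers = {
        'Referer': list_url,
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    params = {'px': 'default', 'city': '深圳', 'needAddtionalResult': 'false'}
    data = {'first': 'true', 'pn': 1, 'kd': 'python'}
    response = session.post(json_url, headers=headers, params=params, data=data)
    print(response.json())

if __name__ == '__main__':
    parse_list_page_with_session()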
