我使用python 3.6.5,我的操作系统是macOS 10.13.6。
我正在学习Web Scraping,我想从这个网站上获取数据(https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=)
这是我的代码:
# encoding: utf-8
import requests
from lxml import etree
def parse_list_page():
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
'Host':'www.lagou.com',
'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
'X-Anit-Forge-Code':'0',
'X-Anit-Forge-Token':None,
'X-Requested-With':'XMLHttpRequest',
}
data = {
'first':'false',
'pn':1,
'kd':'python',
}
response = requests.post(url,headers=headers,data=data)
print(response.json())
def main():
parse_list_page()
if __name__ == '__main__':
main()
我很感谢你花时间回答我的问题。
我得到了答案,下面的代码如下:
# encoding: utf-8
import requests
from lxml import etree
import time
def parse_list_page():
url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3'
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537',
'Host':'www.lagou.com',
'Referer':'https://www.lagou.com/',
'Connection':'keep-alive',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
'Upgrade-Insecure-Requests':'1',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Cache-Control':'no-cache',
'Pragma':'no-cache',
}
response = requests.get(url,headers=headers)
# print(response.text)
r = requests.utils.dict_from_cookiejar(response.cookies)
print(r)
print('='*30)
# r['LGUID'] = r['LGRID']
# r['user_trace_token'] = r['LGRID']
# r['LGSID'] = r['LGRID']
cookies = {
# 'X_MIDDLE_TOKEN':'df7c1d3cfdf279f0caf13df990723620',
# 'JSESSIONID':'ABAAABAAAIAACBI29FE9BDFB6838D8DD69C580E517292C9',
# '_ga':'GA1.2.820168368.1551196380',
# '_gat':'1',
# 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1551196381',
# 'user_trace_token':'20190226235303-99bc357a-39de-11e9-921f-525400f775ce',
# 'LGSID':'20190311094827-c3bc2393-439f-11e9-a15a-525400f775ce',
# 'PRE_UTM':'',
# 'PRE_HOST':'',
# 'PRE_SITE':'',
# 'PRE_LAND':'https%3A%2F%2Fwww.lagou.com%2F',
# 'LGUID':'20190226235303-99bc3944-39de-11e9-921f-525400f775ce',
# '_gid':'GA1.2.1391680888.1552248111',
# 'index_location_city':'%E6%B7%B1%E5%9C%B3',
# 'TG-TRACK-CODE':'index_search',
# 'LGRID':'20190311100452-0ed0525c-43a2-11e9-9113-5254005c3644',
# 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1552269893',
# 'SEARCH_ID':'aae3c38ec76545fc86cd4e23153afe44',
}
cookies.update(r)
print(r)
print('=' * 30)
print(cookies)
print('=' * 30)
headers = {
'Origin':'https://www.lagou.com',
'X-Anit-Forge-Code': '0',
'X-Anit-Forge-Token': None,
'X-Requested-With': 'XMLHttpRequest',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'h-CN,zh;q=0.9,en;q=0.8',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3',
'Connection': 'keep-alive',
}
params = {
'px':'default',
'city':'深圳',
'needAddtionalResult':'false'
}
data = {
'first':'true',
'pn':1,
'kd':'python',
}
url_json = 'https://www.lagou.com/jobs/positionAjax.json'
response = requests.post(url=url_json,headers=headers,params=params,cookies=cookies,data=data)
print(response.json())
def main():
parse_list_page()
if __name__ == '__main__':
main()
我无法获得json作为响应的原因是这里反对Web抓取规则是您在发送请求时需要使用第一个cookie。因此,当您第一次发送请求时,您需要保存cookie,然后更新它以使用您的第二页请求。希望当您遇到此问题时,您将有助于进行网络抓取