我似乎无法获取第1页以外任何页面的输出(每个页面有15家餐厅,而我得到的也只有15条输出)。看起来第一页的输出被第二页的输出覆盖了,依此类推。
我已经尝试对一个页码范围进行抓取,但仍然只返回15个结果(只抓取到了一页)。
import requests
import pandas
from bs4 import BeautifulSoup
# NOTE(review): the indentation of this snippet was lost in the paste, so the
# extent of the two `for` bodies below cannot be read off the text — confirm
# against the original post before running it.

# Browser-like User-Agent header passed to every requests.get() call below.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# Iterate over page numbers 1 through 4 (range's upper bound is exclusive).
for num in range(1,5):
# Build the URL for this page number and download it.
url = 'https://www.zomato.com/auckland/restaurants?gold_partner=1&page={}'.format(num)
response = requests.get(url,headers=headers)
content = response.content
soup = BeautifulSoup(content,"html.parser")
# Find the results container(s), then the result cards inside the first match.
# top_rest[0] raises IndexError if no container div was found.
top_rest = soup.find_all("div",attrs={"class": "col-s-16 search_results mbot"})
list_tr = top_rest[0].find_all("div",attrs={"class": "js-search-result-li even status 1"})
# NOTE(review): list_rest is created after the page loop header rather than
# once before it; presumably this (together with the placement of the loop
# below) is why only one page's rows end up in the CSV — confirm.
list_rest =[]
# Build one dict per result card: restaurant name and address, with newlines
# in the extracted text replaced by spaces.
for tr in list_tr:
dataframe ={}
dataframe["1.rest_name"] = (tr.find("a",attrs={"class": "result-title hover_feedback zred bold ln24 fontsize0"})).text.replace('\n', ' ')
dataframe["2.rest_address"] = (tr.find("div",attrs={"class": "col-m-16 search-result-address grey-text nowrap ln22"})).text.replace('\n', ' ')
list_rest.append(dataframe)
# Bare expression: has no effect in a script (presumably left over from an
# interactive session where it displays the list).
list_rest
# Convert the collected dicts to a DataFrame and write it to CSV without the index column.
df = pandas.DataFrame(list_rest)
df.to_csv("zomato_res26.csv",index=False)
我希望得到一份包含40多家餐馆名称和地址的输出列表,但到目前为止看起来我只得到了一页的结果(15家餐馆)。
请调整缩进,把列表 list_rest 的创建移到循环之外,并在循环内向它追加数据。此外,将输出文件的编码改为 encoding='utf-8-sig',以正确处理其中出现的特殊字符。您可以使用 int(soup.select_one('.pagination-number b:last-child').text) 获取总页数。
我还添加了 requests.Session() 来重用连接。
import requests
import pandas
from bs4 import BeautifulSoup
# Browser-like User-Agent header sent with every page request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# One dict per restaurant, accumulated across ALL pages. The list is created
# once, before the page loop, so each page appends to it instead of replacing
# the rows collected from earlier pages.
list_rest = []

# A single Session is shared by all page requests so the connection is reused.
with requests.Session() as s:
    for num in range(1, 5):  # pages 1..4
        url = 'https://www.zomato.com/auckland/restaurants?gold_partner=1&page={}'.format(num)
        response = s.get(url, headers=headers)
        content = response.content
        soup = BeautifulSoup(content, "html.parser")

        # Results container first, then the individual result cards inside it.
        # top_rest[0] raises IndexError if the page has no results container.
        top_rest = soup.find_all("div", attrs={"class": "col-s-16 search_results mbot"})
        list_tr = top_rest[0].find_all("div", attrs={"class": "js-search-result-li even status 1"})

        # This loop must sit inside the page loop; otherwise only the cards
        # of the last page fetched would be processed.
        for tr in list_tr:
            dataframe = {}
            dataframe["1.rest_name"] = (tr.find("a", attrs={"class": "result-title hover_feedback zred bold ln24 fontsize0"})).text.replace('\n', ' ')
            dataframe["2.rest_address"] = (tr.find("div", attrs={"class": "col-m-16 search-result-address grey-text nowrap ln22"})).text.replace('\n', ' ')
            list_rest.append(dataframe)

# Export once, after every page has been collected. utf-8-sig writes a BOM so
# that non-ASCII characters in names/addresses survive the round trip.
df = pandas.DataFrame(list_rest)
df.to_csv(r"zomato_res26.csv", sep=',', encoding='utf-8-sig', index=False)
如果要遍历所有页面,并使用更快的选择器配合列表推导式:
import requests
import pandas
from bs4 import BeautifulSoup
# Browser-like User-Agent header sent with every page request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# Holds one list of (name, address) tuples per scraped page.
list_rest = []


def getInfo(soup):
    """Return the (name, address) pairs found in one parsed results page.

    Args:
        soup: A parsed page exposing ``select(css_selector)``; each selected
            element must expose a ``text`` attribute.

    Returns:
        A list of ``(name, address)`` tuples with surrounding whitespace
        stripped, in document order.
    """
    names = [item.text.strip() for item in soup.select('.result-title')]
    addresses = [item.text.strip() for item in soup.select('.search-result-address')]
    # zip() pairs the two lists by position and stops at the shorter one, so
    # a page where the counts differ would yield truncated or misaligned rows.
    return list(zip(names, addresses))
# A single Session is shared by all page requests so the connection is reused.
with requests.Session() as s:
    url = 'https://www.zomato.com/auckland/restaurants?gold_partner=1&page={}'

    # Fetch page 1 first: it supplies both its own rows and the page count.
    response = s.get(url.format(1), headers=headers)
    content = response.content
    soup = BeautifulSoup(content, "lxml")

    # select_one() returns None when nothing matches (e.g. no pagination
    # element is rendered); fall back to a single page instead of failing
    # with AttributeError on `.text`.
    last_page_tag = soup.select_one('.pagination-number b:last-child')
    numPages = int(last_page_tag.text) if last_page_tag is not None else 1
    list_rest.append(getInfo(soup))

    # range(2, numPages + 1) is empty when there is only one page, so no
    # separate `numPages > 1` check is needed.
    for page in range(2, numPages + 1):
        response = s.get(url.format(page), headers=headers)
        content = response.content
        soup = BeautifulSoup(content, "lxml")
        list_rest.append(getInfo(soup))

# list_rest holds one list of (name, address) tuples per page; flatten it
# into a single list of rows before building the table.
final_list = [item for sublist in list_rest for item in sublist]
df = pandas.DataFrame(final_list, columns=['1.rest_name', '2.rest_address'])
# utf-8-sig writes a BOM so that non-ASCII characters survive the round trip.
df.to_csv(r"zomato_res26.csv", sep=',', encoding='utf-8-sig', index=False)
如果你不知道最后一页的页码怎么办?以下脚本可以处理这种分页情况。它会先解析出最后一页的页码,然后循环遍历各页,以获取餐馆名称及其对应的电话号码。
import pandas
import requests
from bs4 import BeautifulSoup
# Listing URL with the page number left off; get_content() appends it.
url = "https://www.zomato.com/auckland/restaurants?gold_partner=1&page="


def get_content(session, link):
    """Scrape restaurant names and phone numbers from every results page.

    The last page number is read from the pagination element of the first
    response, then pages 1..last are requested in turn.

    Args:
        session: A ``requests.Session``; its User-Agent header is overwritten.
        link: Listing URL ending in ``page=``; the page number is appended.

    Returns:
        A list of dicts with keys ``'name'`` and ``'phone'``, one per result
        card. ``'phone'`` is ``None`` when the card has no phone link or the
        link lacks the ``data-phone-no-str`` attribute.
    """
    session.headers["User-Agent"] = "Mozilla/5.0"
    response = session.get(link)
    soup = BeautifulSoup(response.text, "lxml")

    # select_one() returns None when nothing matches (e.g. no pagination
    # element is rendered); treat that as a single page rather than failing
    # with AttributeError on `.text`.
    last_page_tag = soup.select_one(".pagination-number b:nth-of-type(2)")
    last_page = int(last_page_tag.text) if last_page_tag is not None else 1

    dataframe = []
    for item_url in range(1, last_page + 1):
        res = session.get(f"{link}{item_url}")
        sauce = BeautifulSoup(res.text, "lxml")
        for elem in sauce.select(".search-card"):
            d = {}
            d['name'] = elem.select_one("a[data-result-type='ResCard_Name']").get_text(strip=True)
            # A card without a phone link yields None instead of raising.
            phone_tag = elem.select_one("a.res-snippet-ph-info")
            d['phone'] = phone_tag.get("data-phone-no-str") if phone_tag is not None else None
            dataframe.append(d)

    return dataframe
if __name__ == '__main__':
    # The session is only needed while pages are being fetched; it is closed
    # on leaving the `with` block, before the CSV is written.
    with requests.Session() as session:
        item = get_content(session, url)

    # One row per restaurant dict returned by get_content(); the index
    # column is left out of the CSV.
    df = pandas.DataFrame(item)
    df.to_csv("zomato_res26.csv", index=False)