我正在尝试网络抓这个网站http://uprera.azurewebsites.net/View_projects.aspx
本网站的使用方式:从下拉列表中选择任意值,然后点击搜索,就会显示表格。在表格中按住 Ctrl 并点击"查看详细信息",它会在新窗口中打开。我想为每个下拉值抓取这些详情链接。
当我运行下面的代码时,它能抓取网站,但返回给我的只是上面提到的网站本身的网址,而不是我要找的详情链接。
下面是我的代码:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape http://uprera.azurewebsites.net/View_projects.aspx: for every
# district in the dropdown, submit the ASP.NET search postback and collect
# one row per project, including its "View Details" URL.
#
# NOTE(review): the original code re-POSTed the per-row button field and then
# read ``resp.url`` -- ASP.NET postbacks do not redirect, so that always gave
# the listing page's own URL.  The detail pages are plain GET URLs with a
# fixed schema, so we build them directly from each row's data instead:
#   View_Registration_Details.aspx?binid=<id>&hfFlag=edit&ddlPRJ=<district>&txtPRJ=

BASE = "http://uprera.azurewebsites.net"
url = BASE + "/View_projects.aspx"
DETAILS_TPL = (BASE + "/View_Registration_Details.aspx"
               "?binid={}&hfFlag=edit&ddlPRJ={}&txtPRJ=")

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}

final_data = []
s = requests.session()  # one session keeps the ASP.NET cookies across posts
response = s.get(url).text
soup = BeautifulSoup(response, "html.parser")
VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']


def _cell_text(tds, i):
    """Return the text of the <span> inside the i-th <td> ('' if absent)."""
    span = tds[i].find("span")
    return span.text if span else ""


# [:-1] drops the trailing placeholder option of the district dropdown.
for title in soup.select("#ContentPlaceHolder1_DdlprojectDistrict [value]")[:-1]:
    search_item = title.text
    formfields = {
        '__VIEWSTATE': VIEWSTATE,
        '__VIEWSTATEGENERATOR': '4F1A7E70',
        '__EVENTVALIDATION': EVENTVALIDATION,
        # This is where the district changes on each iteration.
        'ctl00$ContentPlaceHolder1$DdlprojectDistrict': search_item,
        'ctl00$ContentPlaceHolder1$txtProject': '',
        'ctl00$ContentPlaceHolder1$btnSearch': 'Search',
    }
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    # Refresh the hidden state fields for the next district's postback.
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']

    table = soup.find("table", attrs={"id": "ContentPlaceHolder1_GridView1"})
    if table is None:  # a district with no projects renders no grid at all
        continue
    count = 0
    for row in table.find_all("tr")[1:]:  # [1:] skips the header row
        tds = row.find_all("td")
        rera_number = _cell_text(tds, 1)
        district = _cell_text(tds, 4)
        # binid is the numeric tail after the 9-char "UPRERAPRJ" prefix --
        # TODO confirm the prefix length against live data.
        details_url = DETAILS_TPL.format(rera_number[9:], district)
        final_data.append([
            rera_number,
            _cell_text(tds, 2),  # project name
            _cell_text(tds, 3),  # promoter name
            district,
            _cell_text(tds, 5),  # project type
            details_url,
        ])
        print(details_url)
        count += 1
    print(count)

# newline="" prevents the csv module from emitting blank rows on Windows.
with open("./UP_RERA.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(["rera_number", "project", "promoter",
                     "district", "project_type", "details_url"])
    writer.writerows(final_data)
谁能帮助我呢?我正在解析正确的网址
网址由JavaScript创建 - 但这些网址似乎具有相同的架构
http://uprera.azurewebsites.net/View_Registration_Details.aspx?binid=10996&hfFlag=edit&ddlPRJ=Lucknow&txtPRJ=
网址中包含 district(例如 Lucknow)以及来自 RERA 编号的数字(例如 10996)——所以你可以手动生成它。
代码有变化:
对于测试,我使用if count > 3: break
来限制屏幕上的数据。
import csv
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

# Working version: the site builds the "View Details" links with JavaScript,
# but they follow a fixed schema, so each URL is generated manually from the
# row's RERA number and district:
#   View_Registration_Details.aspx?binid=<id>&hfFlag=edit&ddlPRJ=<district>&txtPRJ=

final_data = []
url = "http://uprera.azurewebsites.net/View_projects.aspx"
s = requests.session()  # one session keeps the ASP.NET cookies across posts
response = s.get(url)
soup = BeautifulSoup(response.text, "html.parser")
VIEWSTATE = soup.select_one("#__VIEWSTATE")['value']
EVENTVALIDATION = soup.select_one("#__EVENTVALIDATION")['value']
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
formfields = {
    '__VIEWSTATE': VIEWSTATE,
    '__VIEWSTATEGENERATOR': '4F1A7E70',
    '__EVENTVALIDATION': EVENTVALIDATION,
    # 'ctl00$ContentPlaceHolder1$DdlprojectDistrict' is filled per iteration.
    'ctl00$ContentPlaceHolder1$txtProject': '',
    'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
}

# [:-1] drops the trailing placeholder option of the district dropdown.
for title in soup.select("#ContentPlaceHolder1_DdlprojectDistrict [value]")[:-1]:
    search_item = title.text
    print('\n-----', search_item, '-----\n')
    formfields['ctl00$ContentPlaceHolder1$DdlprojectDistrict'] = search_item
    response = s.post(url, data=formfields, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # Refresh the hidden ASP.NET state for the next postback (the original
    # reused the first page's values for every district).
    formfields['__VIEWSTATE'] = soup.select_one("#__VIEWSTATE")['value']
    formfields['__EVENTVALIDATION'] = soup.select_one("#__EVENTVALIDATION")['value']

    grid = soup.find("table", attrs={"id": "ContentPlaceHolder1_GridView1"})
    if grid is None:  # a district with no projects renders no grid at all
        continue
    count = 0
    for row in grid.find_all("tr")[1:]:  # [1:] skips the header row
        sublist = [search_item]
        tds = row.find_all("td")
        rera = tds[1].find("span").text
        sublist.append(rera)
        print('rera:', rera)
        name = tds[2].find("span").text
        sublist.append(name)
        print('name:', name)
        promoter_name = tds[3].find("span").text
        sublist.append(promoter_name)
        print('promoter_name:', promoter_name)
        district = tds[4].find("span").text
        sublist.append(district)
        print('district:', district)
        project_type = tds[5].find("span").text
        sublist.append(project_type)
        print('project_type:', project_type)
        # binid is the numeric tail after the 9-char "UPRERAPRJ" prefix --
        # TODO confirm the prefix length against live data.
        rare_id = rera[9:]
        print('rare_id:', rare_id)
        # quote_plus: district names may contain spaces, which are not
        # valid raw inside a URL query string.
        details_url = (
            'http://uprera.azurewebsites.net/View_Registration_Details.aspx'
            '?binid={}&hfFlag=edit&ddlPRJ={}&txtPRJ='
        ).format(rare_id, quote_plus(district))
        sublist.append(details_url)
        print('url:', details_url)
        final_data.append(sublist)
        count += 1
        print('count:', count)
        # For testing: stop after a few rows to limit screen output.
        if count > 3:
            break

filename = "./UP_RERA.csv"
# newline="" prevents the csv module from emitting blank rows on Windows.
with open(filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",")
    writer.writerow(['search_item', 'rare', 'name', 'promoter_name',
                     'district', 'project_type', 'url'])
    writer.writerows(final_data)