Passing parameters in a URL (Python requests) while web scraping


I am trying to web scrape this site: http://uprera.azurewebsites.net/View_projects.aspx

How the site works: select any value from the drop-down list and click Search, and a table appears. In the table, ctrl-click "View Details"; it opens in a new window. I am trying to scrape those links for every drop-down value.

When I run the code below, it scrapes the site, but it gives me the site URL mentioned above instead of the links I am looking for.

Below is my code:

import requests
from bs4 import BeautifulSoup
import csv
import time

final_data = []

url = "http://uprera.azurewebsites.net/View_projects.aspx"
response = requests.get(url).text
soup = BeautifulSoup(response,"html.parser")

VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']

for title in soup.select("#ContentPlaceHolder1_DdlprojectDistrict [value]")[:-1]:
    search_item = title.text
    # print(search_item)
    headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
              'Content-Type':'application/x-www-form-urlencoded',
              'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    formfields = {'__VIEWSTATE':VIEWSTATE,  #Put the value in this variable
                '__VIEWSTATEGENERATOR':'4F1A7E70',
                '__EVENTVALIDATION':EVENTVALIDATION, #Put the value in this variable
                'ctl00$ContentPlaceHolder1$DdlprojectDistrict':search_item, #this is where your city name changes in each iteration
                'ctl00$ContentPlaceHolder1$txtProject':'',
                'ctl00$ContentPlaceHolder1$btnSearch':'Search'}

    # With the form values above I can scrape only one city (e.g. Agra) -
    # how do I loop over all the cities?
    s = requests.session()
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']

    get_list = soup.find_all('option')   # list of all <option> tags
    for element in get_list :
        cities = element["value"]
        #final.append(cities)
        #print(final)

    get_details = soup.find_all("table", attrs={"id":"ContentPlaceHolder1_GridView1"})

    for details in get_details:
        text = details.find_all("tr")[1:]
        count = 0
        for tds in text:
            td = tds.find_all("td")[1]
            rera = td.find_all("span")
            rnumber = ""
            for num in rera:
                rnumber = num.text
                sublist = []
                sublist.append(rnumber)
            name = tds.find_all("td")[2]
            prj_name = name.find_all("span")
            prj = ""
            for prjname in prj_name:
                prj = prjname.text
                sublist.append(prj)
            promoter_name = tds.find_all("td")[3]
            promoter = promoter_name.find_all("span")
            prom = ""
            for promname in promoter:
                prom = promname.text
                sublist.append(prom)
            district = tds.find_all("td")[4]
            dist = district.find_all("span")
            district_name = ""
            for districtname in dist:
                district_name = districtname.text
                sublist.append(district_name)
            project_type = tds.find_all("td")[5]
            project = project_type.find_all("span")
            btn_td = tds.find_all("td")[6]
            ip_name = btn_td.find("input").attrs['name']
            # try to mimic the postback that the "View Details" button fires
            dct = {}
            dct['__VIEWSTATE']=VIEWSTATE
            dct['__VIEWSTATEGENERATOR']=formfields['__VIEWSTATEGENERATOR']
            dct['__EVENTVALIDATION']=EVENTVALIDATION
            dct['ctl00$ContentPlaceHolder1$txtProject'] = ''
            dct['ctl00$ContentPlaceHolder1$DdlprojectDistrict'] = formfields['ctl00$ContentPlaceHolder1$DdlprojectDistrict']
            dct[ip_name+'binid'] = '6869'
            dct[ip_name+'hfFlag'] = 'edit'
            dct[ip_name+'ddlPRJ'] = 'Agra'
            dct[ip_name+'txtPRJ'] = ''

            resp = s.post(url, data=dct, headers=headers)
            projectype = ""
            for prjtype in project:
                projectype = prjtype.text
                sublist.append(projectype)
            print( resp.url )
            sublist.append( resp.url )
            final_data.append(sublist)
            count += 1
            print(count)

filename = "UP_RERA.csv"
with open("./" + filename, "w", newline="") as csvfile:  # newline="" avoids blank rows on Windows
    writer = csv.writer(csvfile, delimiter=",")
    for row in final_data:
        writer.writerow(row)

Can anyone help me with this? Am I parsing the right URL?

beautifulsoup python-requests python-3.6
1 Answer

The URLs are created by JavaScript - but they all seem to follow the same schema:

http://uprera.azurewebsites.net/View_Registration_Details.aspx?binid=10996&hfFlag=edit&ddlPRJ=Lucknow&txtPRJ= 

It contains the district - Lucknow - and the numeric part of the RERA registration number - 10996 - so you can generate it manually.
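For example, a minimal sketch of building one of these URLs by hand. The values are copied from the example URL above; treating the first 9 characters of the registration number as a fixed prefix (like UPRERAPRJ) is an assumption based on the numbers this site returns:

# Hypothetical standalone example; values copied from the URL above.
rera_number = "UPRERAPRJ10996"  # registration number as shown in the results table
district = "Lucknow"

binid = rera_number[9:]  # strip the assumed 9-character prefix -> "10996"
details_url = ('http://uprera.azurewebsites.net/View_Registration_Details.aspx'
               '?binid={}&hfFlag=edit&ddlPRJ={}&txtPRJ='.format(binid, district))
print(details_url)
# http://uprera.azurewebsites.net/View_Registration_Details.aspx?binid=10996&hfFlag=edit&ddlPRJ=Lucknow&txtPRJ=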


Code with my changes:

For testing, I use if count > 3: break to limit the amount of data printed on screen.

import requests
from bs4 import BeautifulSoup
import csv

final_data = []

url = "http://uprera.azurewebsites.net/View_projects.aspx"

s = requests.session()

response = s.get(url)
soup = BeautifulSoup(response.text, "html.parser")

VIEWSTATE = soup.select_one("#__VIEWSTATE")['value']
EVENTVALIDATION = soup.select_one("#__EVENTVALIDATION")['value']

headers= {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

formfields = {
    '__VIEWSTATE': VIEWSTATE,
    '__VIEWSTATEGENERATOR': '4F1A7E70',
    '__EVENTVALIDATION': EVENTVALIDATION,
    #'ctl00$ContentPlaceHolder1$DdlprojectDistrict': search_item,
    'ctl00$ContentPlaceHolder1$txtProject': '',
    'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
}

for title in soup.select("#ContentPlaceHolder1_DdlprojectDistrict [value]")[:-1]:
    search_item = title.text
    print('\n-----', search_item, '-----\n')

    formfields['ctl00$ContentPlaceHolder1$DdlprojectDistrict'] = search_item

    response = s.post(url, data=formfields, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    all_options = soup.find_all('option')  # kept from the original code; `cities` is never used below
    for element in all_options:
        cities = element["value"]

    all_rows = soup.find("table", attrs={"id":"ContentPlaceHolder1_GridView1"}).find_all("tr")[1:]

    count = 0

    for row in all_rows:
        sublist = [search_item]

        tds = row.find_all("td")

        rera = tds[1].find("span").text
        sublist.append(rera)
        print('rera:', rera)

        name = tds[2].find("span").text
        sublist.append(name)
        print('name:', name)

        promoter_name = tds[3].find("span").text
        sublist.append(promoter_name)
        print('promoter_name:', promoter_name)

        district = tds[4].find("span").text
        sublist.append(district)
        print('district:', district)

        project_type = tds[5].find("span").text
        sublist.append(project_type)
        print('project_type:', project_type)

        rera_id = rera[9:]  # numeric part after the assumed 9-character prefix (e.g. "UPRERAPRJ")
        print('rera_id:', rera_id)
        details_url = 'http://uprera.azurewebsites.net/View_Registration_Details.aspx?binid={}&hfFlag=edit&ddlPRJ={}&txtPRJ='.format(rera_id, district)

        sublist.append(details_url)
        print('url:', details_url)

        final_data.append(sublist)

        count += 1
        print('count:', count)

        # for testing, exit after the first 4 results in each district
        if count > 3: break


filename = "./UP_RERA.csv"
with open(filename, "w", newline="") as csvfile:  # newline="" avoids blank rows on Windows
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(['search_item', 'rera', 'name', 'promoter_name', 'district', 'project_type', 'url'])
    for row in final_data:
        writer.writerow(row)
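
To sanity-check the generated links, you could fetch one of them with the same session - a quick sketch, assuming the details page answers a plain GET (which is what opening the link in a new window suggests):

# Optional check: request the first generated URL and confirm it loads.
if final_data:
    check = s.get(final_data[0][-1], headers=headers)
    print(check.status_code, len(check.text))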