我想提取网站上所有酒店的房间表,并会提供一个示例链接。我已经准备了以下代码:
# Start a Chrome session and load the venue page.
driver = webdriver.Chrome()
driver.get(url)
# ------------------------------------------- Click meeting room grid button
# The click is dispatched through JavaScript instead of a native WebDriver
# click. Pages that do not have the toggle are skipped silently.
grid_button_locator = (By.XPATH, '//*[@id="meetingSpaceView"]/li[2]/label')
try:
    meeting_room_grid_button = driver.find_element(*grid_button_locator)
    driver.execute_script("arguments[0].click();", meeting_room_grid_button)
    # Fixed pause after the click; presumably gives the grid time to render
    # -- confirm, and consider an explicit wait instead.
    time.sleep(10)
except NoSuchElementException:
    pass
# ------------------------------------------- Scroll and Show All button
# Wait up to 10 s for the "view all" toggle, then click it with a real
# pointer action. A missing, stale or late button is not an error: the page
# is scraped as it is.
show_all_locator = (By.CSS_SELECTOR, ".MeetingRoomsGrid__viewAllToggleContainer___2HJRU button")
try:
    scroll_height = driver.execute_script("return document.body.scrollHeight")
    wait = WebDriverWait(driver, 10)
    show_all_button = wait.until(EC.presence_of_element_located(show_all_locator))
    pointer = ActionChains(driver)
    pointer.move_to_element(show_all_button).click().perform()
    # Fixed pause after the click; presumably lets the extra rows load -- confirm.
    time.sleep(13)
except (NoSuchElementException, StaleElementReferenceException, TimeoutException):
    pass
# ------------------------------------------- Slider
# Locate the horizontal scrollbar handle of the table once and read its width
# from that same element (the original queried the DOM twice for it).
slider_selector = 'div.ScrollbarLayout_face.ScrollbarLayout_faceHorizontal.public_Scrollbar_face'
try:
    slider = driver.find_element(By.CSS_SELECTOR, slider_selector)
    horizontal_bar_width = slider.rect['width']
except (NoSuchElementException, StaleElementReferenceException):
    # An empty list is the "no slider" sentinel: the later `slider != []`
    # check depends on exactly this value, so it must not become None.
    slider = []
# ------------------------------------------- Extract Table_Part1
# Build df_part1 from the cell groups currently rendered in the grid.
# After dropping empty groups, index 1 is used as the header row and the
# remaining groups are consumed in pairs: group i supplies the row label
# cells, group i + 1 the remaining cells of that row.
# NOTE(review): the pasted script had lost its indentation; the nesting below
# (in particular the position of the final `insert`) is reconstructed --
# confirm against the original script.
cell_list1 = []
columns_list1 = []
df_part1 = pd.DataFrame()
for element in driver.find_elements(By.CSS_SELECTOR, 'div.fixedDataTableCellGroupLayout_cellGroup'):
    # `element.text` is a WebDriver call; read it once per element.
    cell_lines = element.text.split('\n')
    if cell_lines != ['']:
        cell_list1.append(cell_lines)
# At least two groups are needed because index 1 is read as the header row.
if len(cell_list1) > 1:
    columns_list1 = ['Meeting rooms'] + cell_list1[1]
    df_part1 = pd.DataFrame(columns=columns_list1)
    # Stop one short of the end so a trailing unpaired group cannot raise
    # IndexError on `cell_list1[i + 1]`.
    for i in range(2, len(cell_list1) - 1, 2):
        # Build a filtered copy: calling remove() on the list being iterated
        # skips the item that follows each removed one.
        cell_list1[i] = [item for item in cell_list1[i] if not item.startswith('View details')]
        row = "'" + ("','".join(cell_list1[i])) + "'"
        df_part1.loc[row] = cell_list1[i] + cell_list1[i + 1]
    df_part1.insert(0, "title", title)
# ------------------------------------------- Move Slider
# Drag the horizontal scrollbar handle 600 px to the right, scroll the window
# back to the top, then click the "Next" icon until it can no longer be
# found or clicked.
# NOTE(review): the pasted script had lost its indentation; only the drag is
# assumed to be guarded by `slider != []` -- confirm against the original.
try:
    if slider != []:
        ActionChains(driver).click_and_hold(slider).move_by_offset(600, 0).release().perform()
    driver.execute_script("window.scrollTo(0, 0)")
    while True:
        try:
            Next_button = driver.find_element(By.CSS_SELECTOR,
                                              'span.Icons-All__right___2HnMG.Icons-All__icon___3TnCZ.MeetingRoomsGrid__navIcon___cXWdw[aria-label="Next"]')
            ActionChains(driver).move_to_element(Next_button).click().perform()
            time.sleep(60)
        except Exception:
            # Any WebDriver failure ends the paging loop. `Exception` rather
            # than a bare `except:` so Ctrl-C / SystemExit can still stop the
            # script.
            break
except (NoSuchElementException, StaleElementReferenceException):
    pass
# -------------------------------------------Extract Table_Part2
# Re-reads the same cell-group elements (after the slider / Next interaction
# above) and builds df_part2 from them.
# NOTE(review): this paste has lost its indentation, so the nesting of the
# if/else chain below cannot be determined from here -- restore it from the
# original script before running or refactoring.
cell_list2 = []
columns_list2 = []
df_part2 = pd.DataFrame()
# Collect the newline-split text of every cell group, skipping empty ones.
for element in driver.find_elements(By.CSS_SELECTOR, 'div.fixedDataTableCellGroupLayout_cellGroup'):
if element.text.split('\n') != ['']:
cell_list2.append(element.text.split('\n'))
if len(cell_list2) != 0:
print(cell_list2)
# Header normalisation: make index 0 a single-cell ['Meeting rooms'] group.
if len(cell_list2[0]) != 1:
if len(cell_list2[0]) ==2:
del cell_list2[0]
cell_list2.insert(0, ['Meeting rooms'])
else:
cell_list2.insert(0, ['Meeting rooms'])
# NOTE(review): cell_list2[1] comes from str.split and so holds strings; a
# list ['Meeting rooms'] is presumably never a member of it -- confirm the
# intended check (possibly an equality comparison).
if ['Meeting rooms'] in cell_list2[1]:
del cell_list2[1]
# Drop leading header cells so the header has as many cells as group 3.
# NOTE(review): if the header is shorter than group 3 the slice end is
# negative and a different range is deleted -- confirm this cannot happen.
if len(cell_list2[1]) != len(cell_list2[3]):
del cell_list2[1][0:(len(cell_list2[1]) - len(cell_list2[3]))]
columns_list2 = ['Meeting rooms'] + cell_list2[1]
df_part2 = pd.DataFrame(columns=columns_list2)
j = 2
print(cell_list2)
# Groups are consumed in pairs: j gives the label cells, j + 1 the rest.
while j < len(cell_list2):
# NOTE(review): remove() is called on the list being iterated, so the item
# right after a removed one is skipped.
for item in cell_list2[j]:
if item.startswith('View details'):
cell_list2[j].remove(item)
# The row label is the quoted, comma-joined text of the label cells.
row = "'" + ("','".join(cell_list2[j])) + "'"
df_part2.loc[row] = cell_list2[j] + cell_list2[j + 1]
j = j + 2
df_part2.insert(0, "title", title)
# ------------------------------------------- Extract Table_Part 3
# Read the room ids from the inline <script> that contains '{"id"', derive
# one detail-page URL per id, visit each page and collect its name,
# description, image and floor-plan links into df_part3.
# The inner double quotes of the XPath must be escaped; left unescaped they
# end the Python string early and the line does not parse.
script = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//script[contains(.,'{\"id\"')]")))
pattern = r'{"id":"([^"]+)"'
script_text = html.unescape(script.get_property('innerText'))
# Only ids that occur before the first '"badges"' key are considered.
links = re.findall(pattern, script_text.split('"badges"')[0])
# Strip the query string. split() leaves a URL without "?" untouched, whereas
# url[:url.find("?")] would silently drop its last character.
base_url = url.split("?", 1)[0]
roomlinks = [f"{base_url}/meetingRoom-{room_id}" for room_id in links]
room_list = []
# The first id is skipped, as in the original; presumably it is the venue's
# own id rather than a room -- confirm.
for link in roomlinks[1:]:
    print(link)
    driver.get(link)
    time.sleep(10)
    roomtitle = driver.find_element(By.CSS_SELECTOR, '.MeetingRoomDetailPage__meetingRoomNameText___3k-3T').text
    roomAbout = driver.find_element(By.CSS_SELECTOR,
                                    'div.Grid__col___gBFRS.MeetingRoomDetailPage__descriptionContainer___36ypu.Grid__col-xs-12___2eswl.Grid__col-xs-offset-0___aTK-4.Grid__col-sm-6___CaUu-.Grid__col-sm-offset-0___tTaDk.Grid__col-md-offset-0___2SImQ.Grid__col-lg-offset-0___Atw2J').text
    # The three lookups below are optional page parts: a missing or stale
    # element yields an empty string. Only those WebDriver errors are caught,
    # so Ctrl-C and unrelated bugs are no longer swallowed.
    try:
        roomImg = driver.find_element(By.XPATH,
                                      "//IMG[@data-cvent-id='roomProfile-row0-col1-styleImageContainer-imageContainer-canvas-picture-img']").get_attribute(
            'src')
    except (NoSuchElementException, StaleElementReferenceException):
        roomImg = ""
    try:
        roomFloorPlan = driver.find_element(By.CSS_SELECTOR,
                                            "a.MeetingRoomDetailPage__downloadFloorPlans___1E1OW").get_attribute(
            'href')
    except (NoSuchElementException, StaleElementReferenceException):
        roomFloorPlan = ""
    try:
        roomFloorPlanImg = driver.find_element(By.XPATH,
                                               "//IMG[@data-cvent-id='roomProfile-row0-col1-styleImageContainer-imageContainer-canvas-img']").get_attribute(
            'src')
    except (NoSuchElementException, StaleElementReferenceException):
        roomFloorPlanImg = ""
    roomAmenitiesVar = driver.find_elements(By.CSS_SELECTOR,
                                            'h5.MeetingRoomDetailPage__amenityListTitle___1jM5Q')
    roomAmenitiesAm = driver.find_elements(By.CSS_SELECTOR,
                                           'div.Grid__col___gBFRS.MeetingRoomDetailPage__amenityList___3XlFG.Grid__col-xs-12___2eswl.Grid__col-xs-offset-0___aTK-4.Grid__col-sm-6___CaUu-.Grid__col-sm-offset-0___tTaDk.Grid__col-md-offset-0___2SImQ.Grid__col-lg-offset-0___Atw2J ul')
    # NOTE(review): roomAmenities is built but never added to row_values, so
    # it does not reach df_part3 -- confirm whether it should.
    roomAmenities = {v.text: a.text for v, a in zip(roomAmenitiesVar, roomAmenitiesAm)}
    row_values = [roomtitle, roomAbout, roomImg, roomFloorPlan, roomFloorPlanImg]
    room_list.append(row_values)
df_part3 = pd.DataFrame(room_list, columns=['Meeting rooms', 'Description', 'Image', 'Floor Plan',
                                            'Floor Plan Image'])
# ---------------------------------------------------------------------
# Outer-join the grid table (part 1) with the per-room detail data (part 3)
# on the room name.
finalldf = pd.merge(df_part1, df_part3, on='Meeting rooms', how='outer')
# ------------------------------------------- Extract common column list
# Join keys for part 2: "title" plus every part-2 column that also exists in
# finalldf. The keys keep columns_list2's order (deduplicated); a set
# intersection would give them in an order that can change between runs.
shared_columns = dict.fromkeys(col for col in columns_list2 if col in finalldf.columns)
common_col = ["title"] + list(shared_columns)
# finalDF is only produced when part 1 yielded rows, as in the original.
if not df_part1.empty:
    finalDF = pd.merge(finalldf, df_part2, on=common_col, how='outer')
但我有两个问题:
此外,由于我是这个领域的新手,我写的代码非常臃肿且效率低下。能否指导我如何优化这段代码来解决这些问题?或者是否有更合适的方法来提取这类表格?
事实上,Selenium 可能不是这个项目的最佳选择。
在决定使用哪些工具进行抓取之前,弄清楚网页如何加载所需的数据以及这些数据存储在何处是非常重要的。
在您的情况下,所有数据都存储在 HTML 内嵌脚本中的一个 JavaScript 变量里,变量名为 `venueProfile`。这意味着,只需打开网页,您就已经拥有完整的数据集。因此,剩下的唯一步骤是解析数据以使其可用。
这是如何做到的。显然,这可以根据您的需求进一步扩展。
import json
import requests
import re
import html
# Getting the webpage content
url = 'https://www.cvent.com/venues/en-US/dubai/hotel/jw-marriott-marquis-hotel-dubai/venue-c6d701db-da99-4133-a3de-608d446d3446'
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() turns an HTTP error page into an explicit exception
# instead of a confusing parse failure further down.
req = requests.get(url, timeout=30)
req.raise_for_status()
# Parsing the data which is stored in "venueProfile" variable of javascript
match = re.search(r'(?<="venueProfile":").+?(?=",)', req.text)
if match is None:
    # Without this check a changed page layout surfaces as
    # "'NoneType' object has no attribute 'group'".
    raise ValueError('"venueProfile" was not found in the page source')
venue_profile_string = match.group(0)
# Dealing with escape characters
data_string = html.unescape(venue_profile_string).replace('\\\"', '"')
# Converting to python dict
data = json.loads(data_string)
最后,您可以根据需要从中提取数据。以下是一些输出示例。
>>> len(data['meetingRooms'])
47
>>> data['meetingRooms'][9]['name']
'Dubai Ballroom'
>>> data['meetingRooms'][9]['totalSpace']
{'imperialValue': 14983.369751245924, 'metricValue': 1392.0}
>>> data['meetingRooms'][9]['maxCapacity']
1380