我想从一个名为 Timable 的公共活动网站上抓取活动的时间和地点信息。该网站使用 Google 地图显示详细位置。对于只有一个地点的活动,我已经成功获取了 Google 地图链接;但对于有多个地点的活动,页面上似乎根本找不到任何指向 Google 地图的链接。这是为什么?
我编写的代码如下,并附上四个用于测试的示例活动页面;注释中的数字表示每个活动的地点数量。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
import time
import pandas as pd
import os
import re
# Browser configuration. Set any Chrome flags on chrome_options before the
# driver is created.
chrome_options = Options()
# Hand the options object to the driver; previously it was built but never
# used, so anything configured on it would have been silently ignored.
browser = webdriver.Chrome(options=chrome_options)

# Test examples. The leading number is how many locations the event has.
# 1 location - venue: 聯和墟社區會堂
url = 'https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022'
# 3 locations - event: 甲辰年中秋綵燈會 2024
# url = 'https://timable.com/hk/zh/event/66a32ffbea8f6c5f5431d608/%E7%94%B2%E8%BE%B0%E5%B9%B4%E4%B8%AD%E7%A7%8B%E7%B6%B5%E7%87%88%E6%9C%832024'
# 6 locations - event: 避風涼茶館社區巡演
# url = 'https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94'
# 1 location - event: Skechers Summer Waterpark
# url = 'https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92'

# Open the selected page, then pause for a fixed 5 seconds
# (presumably to let the page finish rendering).
browser.get(url)
time.sleep(5)
# -----------------------------------------------------------------------------------------------------------------------
def sanitize_string(value):
    """Return ``value`` with control characters stripped out.

    Every character in the ranges U+0000-U+001F and U+007F-U+009F is
    removed (this includes tab, newline and carriage return); all other
    characters are kept in their original order.
    """
    control_chars = re.compile(r'[\x00-\x1f\x7f-\x9f]')
    return control_chars.sub('', value)
# -----------------------------------------------------------------------------------------------------------------------
def find_between(s, first, last):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` starts right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    first_at = s.find(first)
    if first_at == -1:
        return ""
    begin = first_at + len(first)
    stop = s.find(last, begin)
    return "" if stop == -1 else s[begin:stop]
# -----------------------------------------------------------------------------------------------------------------------
def extract_event_info(url, *, attempts=3, page_load_wait=5, retry_wait=2, element_timeout=15):
    """Load a Timable event page and scrape its time and location details.

    Uses the module-level ``browser`` WebDriver. The page is loaded up to
    ``attempts`` times; any exception raised while loading or scraping
    counts as a failed attempt and triggers a retry.

    Args:
        url: Address of the event page to open.
        attempts: Maximum number of page loads, and of tries at locating the
            main container within each load.
        page_load_wait: Seconds to sleep after ``browser.get`` before looking
            for elements.
        retry_wait: Seconds to sleep between failed attempts.
        element_timeout: Seconds each explicit wait may take before timing
            out.

    Returns:
        ``None`` when an attempt completes (the location titles and map links
        are only printed). When every attempt fails, the error is printed and
        a placeholder list of six empty strings is returned.
    """
    # Placeholder record for the failure paths. The original left this
    # definition commented out, so every ``return event`` raised NameError.
    event = ["", "", "", "", "", ""]
    container_xpath = "//div[@class='chakra-container cha-pmdu9d']"

    for attempt in range(1, attempts + 1):
        try:
            browser.get(url)
            time.sleep(page_load_wait)

            # Retry locating the main container on its own, so a slow render
            # does not force a full page reload.
            parent_container = None
            for container_attempt in range(1, attempts + 1):
                try:
                    parent_container = WebDriverWait(browser, element_timeout).until(
                        EC.visibility_of_element_located((By.XPATH, container_xpath))
                    )
                    break
                except Exception as e:
                    if container_attempt == attempts:
                        print(f"Error(parent_container) for url: {url}: {e}")
                        return event
                    time.sleep(retry_wait)

            # box 2: the "display location" panel.
            # until() raises TimeoutException instead of returning a falsy
            # value, so reaching the print means the element was found.
            event_box2 = WebDriverWait(browser, element_timeout).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@id='displayLocation']"))
            )
            print("event_box2 found!")
            box2_child_box_list = WebDriverWait(event_box2, element_timeout).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, ".//div[contains(@class, 'cha-gq6fqh')]")
                )
            )
            print("box2_child_box_list found!")

            # An XPath starting with "//" is resolved from the document root
            # even when called on an element, so search from the browser to
            # make the page-wide scope explicit.
            a_tags = browser.find_elements(
                By.XPATH, "//a[contains(@href,'https://maps.google.com/maps?ll=')]"
            )
            if a_tags:
                print("a tags found!")
                location_div_titles = [
                    box.find_element(
                        By.XPATH, ".//p[contains(@class, 'chakra-text cha-722v25')]"
                    ).text
                    for box in box2_child_box_list
                ]
                print(f"location_div_titles: {location_div_titles}")
                for a_tag in a_tags:
                    print(f"link: {a_tag.get_attribute('href')}")

            # box 1: the container holding every time/location group.
            event_box1 = parent_container.find_element(
                By.XPATH, ".//div[contains(@class, 'cha-nm882m')]"
            )
            box1 = []
            for child_box in event_box1.find_elements(By.XPATH, "./div"):
                times = []
                locations = []
                for child_box_div in child_box.find_elements(By.XPATH, "./div"):
                    element_class = child_box_div.get_attribute("class")
                    if element_class == 'chakra-stack cha-16yidj1':  # time row
                        times.append(child_box_div.text)
                    elif element_class == 'chakra-stack cha-1igwmid':  # location row
                        location = child_box_div.find_element(By.XPATH, ".//button").text
                        address_nodes = child_box_div.find_elements(By.XPATH, ".//p[2]")
                        if address_nodes:
                            # Append the second paragraph when present.
                            location += ', ' + address_nodes[0].text
                        locations.append(location)
                # One group reads "time, time; location, location".
                box1.append('; '.join((', '.join(times), ', '.join(locations))))

            # NOTE(review): this string is built but not stored or returned;
            # the original left `event[4] = event_time_location` commented
            # out. Confirm the intended return value before wiring it in.
            event_time_location = '~'.join(box1)
            break  # Attempt succeeded
        except Exception as e:
            if attempt == attempts:
                print(f"Error(getting url) for url: {url}: {e}")
                return event
            time.sleep(retry_wait)  # Wait before retrying
    return None
# Scrape the selected test page. quit() is used instead of close() so the
# whole WebDriver session ends, and `finally` guarantees that happens even
# when scraping raises.
try:
    # event= extract_event_info(url)
    extract_event_info(url)
finally:
    browser.quit()

# Disabled Excel export of the scraped record, kept for reference:
# print(event)
# output_file = os.path.join('output', f'output_test_1018.xlsx') # Output file name in the output folder
# columns = ['Title', 'Type', 'Keyword', 'Location Num', 'Time & Location', 'Description']
# events_df = pd.DataFrame(event, columns=columns)
# events_df = pd.DataFrame([event], columns=columns)
# events_df.to_excel(output_file, index=False)
您可以直接请求目标网站的 API 路由端点来获取所需的数据,只需在活动页面的 URL 后追加如下查询参数:
{url}?_data=routes%2F%24region.%24locale._main.event.%24id.%28%24slug%29
import requests
def getDetails(url):
    """Print the name and a Google Maps link for each location of an event.

    Requests the event page's JSON data route (the page URL plus the
    ``_data`` query parameter) and prints one ``[name, map_url]`` list per
    entry in ``event.sections``.

    Args:
        url: Address of the Timable event page, without a query string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
        KeyError: If the response lacks the expected ``event``/``sections``/
            ``location`` fields.
    """
    data_url = f'{url}?_data=routes%2F%24region.%24locale._main.event.%24id.%28%24slug%29'
    # A timeout stops a stalled connection from hanging forever, and
    # raise_for_status reports an HTTP error instead of a confusing JSON or
    # KeyError failure further down.
    resp = requests.get(data_url, timeout=30)
    resp.raise_for_status()
    sections = resp.json()['event']['sections']
    for section in sections:
        location = section['location']
        # The API lists coordinates as [longitude, latitude] (for example
        # [114.140367, 22.495468] for a Hong Kong venue), while the Google
        # Maps `ll` parameter expects "latitude,longitude".
        longitude = location['coordinate'][0]
        latitude = location['coordinate'][1]
        data = [
            f"name: {location['name']}",
            f"map_url: https://maps.google.com/maps?ll={latitude},{longitude}&t=m&hl=en-US&gl=US&mapclient=apiv3",
        ]
        print(data)
# Event pages to query: one single-location event, one multi-location event,
# and another single-location event.
urls = [
    'https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022',
    'https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94',
    'https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92',
]
for event_url in urls:
    getDetails(event_url)
['name: 聯和墟社區會堂', 'map_url: https://maps.google.com/maps?ll=114.140367,22.495468&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 添馬公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.165768,22.281595&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 中山紀念公園', 'map_url: https://maps.google.com/maps?ll=114.144315,22.290303&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 上水花園 第一號', 'map_url: https://maps.google.com/maps?ll=114.130484,22.503538&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 粉嶺聯和趁墟 - 戶外活動空間(魚花園)', 'map_url: https://maps.google.com/maps?ll=114.1427568,22.4978873&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 荃灣公園 中央廣場', 'map_url: https://maps.google.com/maps?ll=114.113423,22.364039&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 黃大仙廣場', 'map_url: https://maps.google.com/maps?ll=114.193813,22.341969&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: JCCAC賽馬會創意藝術中心', 'map_url: https://maps.google.com/maps?ll=114.165719,22.334665&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 觀塘海濱花園', 'map_url: https://maps.google.com/maps?ll=114.216954,22.312821&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 牛棚藝術村 露天空地', 'map_url: https://maps.google.com/maps?ll=114.191326,22.320579&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 荔枝角公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.138414,22.339071&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 風之塔公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.1538531,22.2429592&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 香港仔海濱公園 露天廣場', 'map_url: https://maps.google.com/maps?ll=114.15334,22.247818&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 仁愛廣場', 'map_url: https://maps.google.com/maps?ll=113.975603,22.396653&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 天水圍嘉湖銀座廣場', 'map_url: https://maps.google.com/maps?ll=114.003357,22.457727&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 東區文化廣場', 'map_url: https://maps.google.com/maps?ll=114.23025,22.282962&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 西貢海濱公園', 'map_url: https://maps.google.com/maps?ll=114.274504,22.382432&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 九龍公園 拱廊', 'map_url: https://maps.google.com/maps?ll=114.170356,22.300241&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 茂蘿街7號 公眾休憩空間', 'map_url: https://maps.google.com/maps?ll=114.176814,22.27736&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: Skechers Summer Waterpark', 'map_url: https://maps.google.com/maps?ll=114.1356492743805,22.507647999322668&t=m&hl=en-US&gl=US&mapclient=apiv3']
请告诉我这是否能解决您的问题