尝试抓取动态网页上的谷歌地图链接:部分页面成功,部分页面失败

问题描述 投票:0回答:1

我想从一个名为 Timable 的公共活动网站上获取活动的时间和地点信息。该网站使用 Google 地图显示详细位置。有些活动只有一个地点,我已成功获取其地图链接;但对于有多个地点的活动,页面上似乎根本找不到任何来自谷歌地图的链接。这是怎么回事?

我编写了如下所示的代码,并附上四个用于测试的示例活动页面。注释中的数字代表每个活动的地点数量。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException 
import time
import pandas as pd
import os
import re 

# Chrome options object; note that it is created here but not passed to
# webdriver.Chrome() below, so it currently has no effect.
chrome_options = Options()
# Single shared WebDriver instance used by extract_event_info().
browser = webdriver.Chrome()


# Test examples. The leading number in each label is the number of
# locations listed for that event; exactly one `url` should be active.
# 1 location - Luen Wo Hui Community Hall (聯和墟社區會堂)
url = 'https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022'
# 3 locations - Mid-Autumn Lantern Carnival 2024 (甲辰年中秋綵燈會 2024)
# url = 'https://timable.com/hk/zh/event/66a32ffbea8f6c5f5431d608/%E7%94%B2%E8%BE%B0%E5%B9%B4%E4%B8%AD%E7%A7%8B%E7%B6%B5%E7%87%88%E6%9C%832024'
# 6 locations - herbal tea house community tour (避風涼茶館社區巡演)
# url = 'https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94'
# 1 location - Skechers Summer Waterpark
# url = 'https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92'
# Initial page load; extract_event_info() calls browser.get(url) again itself.
browser.get(url)

# Fixed pause after the load before any extraction starts.
time.sleep(5)

# -----------------------------------------------------------------------------------------------------------------------
def sanitize_string(value):
    """Return *value* with control characters stripped out.

    Characters in the ranges U+0000-U+001F and U+007F-U+009F are removed;
    every other character is kept unchanged.
    """
    control_chars = re.compile(r'[\x00-\x1f\x7f-\x9f]')
    return control_chars.sub('', value)
# -----------------------------------------------------------------------------------------------------------------------
def find_between(s, first, last):
    """Return the text of *s* between the markers *first* and *last*.

    The search for *last* starts right after the first occurrence of
    *first*. An empty string is returned when either marker is missing.
    """
    marker_pos = s.find(first)
    if marker_pos == -1:
        return ""

    begin = marker_pos + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""

    return s[begin:stop]
# -----------------------------------------------------------------------------------------------------------------------
def extract_event_info(url):
    """Load a Timable event page and print the location details found on it.

    The page is opened with the module-level ``browser``. Two areas of the
    page are read:

    * "box 2" (``div`` with id ``displayLocation``): the location titles and
      the ``href`` of every anchor whose link contains
      ``https://maps.google.com/maps?ll=`` are printed.
    * "box 1" (``div`` whose class contains ``cha-nm882m``): the times and
      locations of each child box are joined into one string. That string
      is built but not returned yet.

    The whole load-and-extract sequence is attempted up to three times; any
    exception (stale element, timeout, missing element, ...) triggers a
    retry after a short pause. Locating the parent container has its own
    inner retry loop of three attempts.

    Args:
        url: Address of the event page to open.

    Returns:
        None, both on success and after the last failed attempt. Failures
        are reported with ``print`` instead of being raised.
    """
    container_locator = (By.XPATH, "//div[@class='chakra-container cha-pmdu9d']")

    attempts_u = 3
    while attempts_u > 0:
        try:
            browser.get(url)
            time.sleep(5)

            # Retry locating the parent container. The exception is bound
            # as `e` so the error message below can actually be formatted.
            parent_container = None
            attempts_p = 3
            while attempts_p > 0:
                try:
                    parent_container = WebDriverWait(browser, 15).until(
                        EC.visibility_of_element_located(container_locator)
                    )
                    break  # Break if successful
                except Exception as e:
                    attempts_p -= 1
                    if attempts_p == 0:
                        print(f"Error(parent_container) for url: {url}: {e}")
                        return None
                    time.sleep(2)  # Wait before retrying

            # box 2
            # until() either returns a truthy result or raises
            # TimeoutException, so reaching the print means it was found.
            event_box2 = WebDriverWait(browser, 15).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@id='displayLocation']"))
            )
            print("event_box2 found!")

            box2_child_box_list = WebDriverWait(event_box2, 15).until(
                EC.presence_of_all_elements_located((By.XPATH, ".//div[contains(@class, 'cha-gq6fqh')]"))
            )
            print("box2_child_box_list found!")

            # NOTE(review): an XPath starting with "//" is evaluated against
            # the whole document even when called on event_box2; use ".//a"
            # if only anchors inside the box are wanted - confirm intent.
            a_tags = event_box2.find_elements(By.XPATH, "//a[contains(@href,'https://maps.google.com/maps?ll=')]")
            if a_tags:
                print("a tags found!")
                location_div_titles = [
                    box2_child_box.find_element(By.XPATH, ".//p[contains(@class, 'chakra-text cha-722v25')]").text
                    for box2_child_box in box2_child_box_list
                ]
                print(f"location_div_titles: {location_div_titles}")
                for a_tag in a_tags:
                    link = a_tag.get_attribute("href")
                    print(f"link: {link}")

            # box 1: the box covering all the times and locations
            event_box1 = parent_container.find_element(By.XPATH, ".//div[contains(@class, 'cha-nm882m')]")
            box1 = []
            for child_box in event_box1.find_elements(By.XPATH, "./div"):
                child_box_div_time = []
                child_box_div_location = []
                for child_box_div in child_box.find_elements(By.XPATH, "./div"):
                    element_class = child_box_div.get_attribute("class")
                    if element_class == 'chakra-stack cha-16yidj1':  # time
                        child_box_div_time.append(child_box_div.text)
                    elif element_class == 'chakra-stack cha-1igwmid':  # location
                        location_name_text = child_box_div.find_element(By.XPATH, ".//button").text
                        # The second <p>, when present, holds the detailed address.
                        child_box_div_p2 = child_box_div.find_elements(By.XPATH, ".//p[2]")
                        if child_box_div_p2:
                            detailed_address_text = ', ' + child_box_div_p2[0].text
                        else:
                            detailed_address_text = ''
                        child_box_div_location.append(location_name_text + detailed_address_text)

                # "<times>; <locations>" for this child box
                child_box_string = '; '.join(
                    ', '.join(map(str, subarray))
                    for subarray in (child_box_div_time, child_box_div_location)
                )
                box1.append(child_box_string)

            # Built for the planned result row; not returned or printed yet.
            event_time_location = '~'.join(map(str, box1))

            break  # Break if successful

        except Exception as e:
            # Covers StaleElementReferenceException, TimeoutException and
            # any other failure during loading or extraction.
            attempts_u -= 1
            if attempts_u == 0:
                print(f"Error(getting url) for url: {url}: {e}")
                return None
            time.sleep(2)  # Wait before retrying

    return None

# Run the extraction for the selected test URL. The return value is not used
# here because the function currently only prints what it finds.
try:
    extract_event_info(url)
finally:
    # quit() ends the whole WebDriver session (all windows and the driver
    # process), whereas close() only closes the current window. Running it
    # in `finally` releases the browser even if the extraction raises.
    browser.quit()

# Planned export step, disabled until extract_event_info() returns a row:
# event= extract_event_info(url)
# print(event)
# output_file = os.path.join('output', f'output_test_1018.xlsx')  # Output file name in the output folder
# columns = ['Title', 'Type', 'Keyword', 'Location Num', 'Time & Location', 'Description']
# events_df = pd.DataFrame(event, columns=columns)
# events_df = pd.DataFrame([event], columns=columns)
# events_df.to_excel(output_file, index=False)

例如,测试第一个活动时,成功获得了链接:[图片:在此输入图片描述] 但测试第二个活动时,没有获得任何链接:[图片:在此输入图片描述]

python web-scraping
1个回答
0
投票

您可以不使用 Selenium,而是直接请求目标网站的 API 路由端点来获取所需的数据。在活动页面的 URL 后面附加如下查询参数即可:

{url}?_data=routes%2F%24region.%24locale._main.event.%24id.%28%24slug%29

示例代码:

import requests

def getDetails(url, timeout=10):
    """Print the name and a Google Maps URL for each section of an event.

    The event page URL is extended with the ``_data`` route query parameter
    and fetched as JSON. For every entry of ``event.sections`` in the
    response, a two-item list ``["name: ...", "map_url: ..."]`` is printed.

    Args:
        url: Event page URL without a query string.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
        KeyError: If the response lacks the expected keys.
    """
    data_url = f'{url}?_data=routes%2F%24region.%24locale._main.event.%24id.%28%24slug%29'
    resp = requests.get(data_url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    resp.raise_for_status()
    sections = resp.json()['event']['sections']

    for section in sections:
        location = section['location']
        coordinate = location['coordinate']
        # NOTE(review): the sample output shows coordinate[0] around 114 and
        # coordinate[1] around 22, which looks like [longitude, latitude];
        # Google's `ll` parameter is conventionally "latitude,longitude".
        # The order is kept as-is here - confirm and swap if needed.
        data = [f"name: {location['name']}", f"map_url: https://maps.google.com/maps?ll={coordinate[0]},{coordinate[1]}&t=m&hl=en-US&gl=US&mapclient=apiv3"]
        print(data)

# Event pages to query; details are printed for each one in turn.
urls = [
    'https://timable.com/hk/zh/event/2257473/%E7%84%A1%E6%AF%92-%E6%9C%89%E6%A8%82%E5%9C%A8%E5%8C%97%E5%8D%80-%E5%98%89%E5%B9%B4%E8%8F%AF-2022',
    'https://timable.com/hk/zh/event/66bd5f564d998ae07f8356b1/%E9%81%BF%E9%A2%A8%E6%B6%BC%E8%8C%B6%E9%A4%A8%E7%A4%BE%E5%8D%80%E5%B7%A1%E6%BC%94',
    'https://timable.com/hk/zh/event/6684296b5c76c0dac9625910/Skechers%E6%88%B6%E5%A4%96%E6%B0%B4%E4%B8%8A%E6%A8%82%E5%9C%92',
]
for event_url in urls:
    getDetails(event_url)

输出示例:

['name: 聯和墟社區會堂', 'map_url: https://maps.google.com/maps?ll=114.140367,22.495468&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 添馬公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.165768,22.281595&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 中山紀念公園', 'map_url: https://maps.google.com/maps?ll=114.144315,22.290303&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 上水花園 第一號', 'map_url: https://maps.google.com/maps?ll=114.130484,22.503538&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 粉嶺聯和趁墟 - 戶外活動空間(魚花園)', 'map_url: https://maps.google.com/maps?ll=114.1427568,22.4978873&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 荃灣公園 中央廣場', 'map_url: https://maps.google.com/maps?ll=114.113423,22.364039&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 黃大仙廣場', 'map_url: https://maps.google.com/maps?ll=114.193813,22.341969&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: JCCAC賽馬會創意藝術中心', 'map_url: https://maps.google.com/maps?ll=114.165719,22.334665&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 觀塘海濱花園', 'map_url: https://maps.google.com/maps?ll=114.216954,22.312821&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 牛棚藝術村 露天空地', 'map_url: https://maps.google.com/maps?ll=114.191326,22.320579&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 荔枝角公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.138414,22.339071&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 風之塔公園 露天劇場', 'map_url: https://maps.google.com/maps?ll=114.1538531,22.2429592&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 香港仔海濱公園 露天廣場', 'map_url: https://maps.google.com/maps?ll=114.15334,22.247818&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 仁愛廣場', 'map_url: https://maps.google.com/maps?ll=113.975603,22.396653&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 天水圍嘉湖銀座廣場', 'map_url: https://maps.google.com/maps?ll=114.003357,22.457727&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 東區文化廣場', 'map_url: https://maps.google.com/maps?ll=114.23025,22.282962&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 西貢海濱公園', 'map_url: https://maps.google.com/maps?ll=114.274504,22.382432&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 九龍公園 拱廊', 'map_url: https://maps.google.com/maps?ll=114.170356,22.300241&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: 茂蘿街7號 公眾休憩空間', 'map_url: https://maps.google.com/maps?ll=114.176814,22.27736&t=m&hl=en-US&gl=US&mapclient=apiv3']
['name: Skechers Summer Waterpark', 'map_url: https://maps.google.com/maps?ll=114.1356492743805,22.507647999322668&t=m&hl=en-US&gl=US&mapclient=apiv3']

请告诉我这是否能解决您的问题

© www.soinside.com 2019 - 2024. All rights reserved.