Google Map Selenium Scraper - 缩略图/照片网址和其他信息

问题描述 投票:0回答:0

我从 github.com 上找到了下面这个谷歌地图抓取器,并对其做了进一步修改。我需要用它从抓取到的地点中获取以下数据,这些地点是马来西亚槟城乔治市售卖一种特色食物(Char Kuey Teow)的店铺:

  • 名称
  • 经纬度
  • 评分和评分计数
  • 地址和联系电话
  • 代表照片/缩略图网址
  • 它的网址
  • 该地点/餐厅/咖啡馆的评论中,包含搜索关键字“Char Kuey Teow”的评论
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

class PlacesScraper:
    """Scrape place cards from a Google Maps search-results page via Selenium.

    The scraper drives a Chrome instance, scrolls the result list until enough
    cards are present (or a timeout expires) and turns every card into a plain
    ``dict``. Call :meth:`close` when done to release the browser.
    """

    # "!3d<latitude>!4d<longitude>" pair embedded in a place URL.
    _COORDINATES_RE = re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)")
    # Rating line such as "4.3(1,234)": score followed by the review count.
    _RATING_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*\(([\d,]+)\)")
    # URL inside an inline ``style`` attribute, e.g. background-image: url("...").
    _STYLE_URL_RE = re.compile(r"url\(\"(https:[^)]+)\"\)")
    # Card text lines are mapped onto these keys positionally.
    _TEXT_KEYS = ("name", "rating", "address", "timing")

    def __init__(
        self,
        headless=True,
        load_images=False,
        chromedriver_path="chromedriver.exe",
        window_size=(700, 900),
    ):
        """Start a Chrome driver.

        Args:
            headless: Pass ``--headless`` and ``--disable-gpu`` to Chrome.
            load_images: When false, set the Chrome preference
                ``profile.managed_default_content_settings.images`` to ``2``.
            chromedriver_path: Path to the chromedriver executable.
            window_size: ``(width, height)`` forwarded to ``set_window_size``.
        """
        # Set first so close()/__del__ are safe even if start-up fails below.
        self.driver = None

        options = webdriver.ChromeOptions()

        if headless:
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")

        if not load_images:
            prefs = {"profile.managed_default_content_settings.images": 2}
            options.add_experimental_option("prefs", prefs)

        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        # Local import: the file only imports ``webdriver`` at module level.
        from selenium.webdriver.chrome.service import Service

        try:
            # Current Selenium 4 API: the driver path is passed via a Service.
            self.driver = webdriver.Chrome(
                service=Service(executable_path=chromedriver_path),
                options=options,
            )
        except TypeError:
            # Older Selenium releases do not accept ``service=``; fall back to
            # the legacy keyword the original code used.
            self.driver = webdriver.Chrome(
                executable_path=chromedriver_path, options=options
            )
        self.driver.set_window_size(*window_size)

    @classmethod
    def _parse_coordinates(cls, place_url):
        """Return ``(latitude, longitude)`` parsed from the ``!3d..!4d..`` URL part.

        Returns ``(None, None)`` when the URL is empty or carries no such pair,
        instead of raising.
        """
        match = cls._COORDINATES_RE.search(place_url or "")
        if match is None:
            return None, None
        return float(match.group(1)), float(match.group(2))

    @classmethod
    def _parse_card_text(cls, text):
        """Turn the visible text of a result card into a dict.

        Non-ASCII characters are removed, runs of spaces are collapsed and the
        resulting lines are assigned, in order, to ``name``, ``rating``,
        ``address`` and ``timing`` (missing lines simply leave keys out). A
        rating line of the form ``"4.3(1,234)"`` is split into a float
        ``rating`` and an int ``rating count``; any other rating text (for
        example ``"No reviews"``) is kept unchanged. The ASCII-only text before
        space collapsing is stored under ``text``.
        """
        ascii_text = re.sub(r"[^\x00-\x7f]", r"", text or "")
        collapsed = re.sub(r" +", " ", ascii_text)
        item = dict(zip(cls._TEXT_KEYS, collapsed.split("\n")))

        rating_match = cls._RATING_RE.match(item.get("rating", ""))
        if rating_match is not None:
            item["rating"] = float(rating_match.group(1))
            item["rating count"] = int(rating_match.group(2).replace(",", ""))

        item["text"] = ascii_text
        return item

    @classmethod
    def _extract_photo_url(cls, card):
        """Return the thumbnail URL of a card element, or ``""`` if none is found.

        The inline ``style`` URL is tried first (as the original code did); the
        ``src`` attribute of the image is used as a fallback.
        """
        images = card.find_elements(By.XPATH, ".//div[1]/div/a/img")
        if not images:
            # Fallback in case the thumbnail is not at the expected position.
            images = card.find_elements(By.TAG_NAME, "img")

        for image in images:
            style_match = cls._STYLE_URL_RE.search(
                image.get_attribute("style") or ""
            )
            if style_match:
                return style_match.group(1)
            src = image.get_attribute("src")
            if src and src.startswith("http"):
                return src
        return ""

    def _collect_cards(self, timeout, max_results, scroll_pause):
        """Scroll the result list and return at most ``max_results`` card elements."""
        cards = []
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline and len(cards) < max_results:
            cards = self.driver.find_elements(By.CLASS_NAME, "bfdHYd")
            if cards:
                # Scrolling the last card into view makes the page load more.
                self.driver.execute_script(
                    "arguments[0].scrollIntoView();", cards[-1]
                )
                # Give the page time to append results instead of busy-looping.
                time.sleep(scroll_pause)
            else:
                time.sleep(1)
        return cards[:max_results]

    def get_places_data(
        self,
        search_url,
        timeout=10,
        max_results=100,
        scroll_pause=0.5,
    ):
        """Load ``search_url`` and return one dict per result card.

        Args:
            search_url: Google Maps search URL to open.
            timeout: Maximum number of seconds spent scrolling for results.
            max_results: Maximum number of cards to process (must be <= 120).
            scroll_pause: Seconds to wait after each scroll step.

        Returns:
            A list of dicts with the keys produced by ``_parse_card_text``
            plus ``url``, ``latitude``, ``longitude`` and ``photo url``.
            ``latitude``/``longitude`` are ``None`` when the place URL holds no
            coordinates. An empty list is returned when no card was found.
        """
        assert max_results <= 120, "max_results should be <= 120"

        self.driver.get(search_url)
        time.sleep(1)

        data = []
        for card in self._collect_cards(timeout, max_results, scroll_pause):
            # The place link is an anchor located via the card's parent element.
            place_url = card.find_element(By.XPATH, "..//a").get_property("href")
            latitude, longitude = self._parse_coordinates(place_url)

            # Name, rating, address and timing are read from the card's text.
            item = self._parse_card_text(
                card.find_element(By.XPATH, ".//div[4]").text
            )
            item["url"] = place_url
            item["latitude"] = latitude
            item["longitude"] = longitude
            item["photo url"] = self._extract_photo_url(card)
            data.append(item)
        return data

    def close(self):
        """Shut the browser down; safe to call more than once."""
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        self.driver = None
        # quit() ends the whole driver session, not just the current window.
        driver.quit()

    def __del__(self):
        try:
            self.close()
        except Exception:
            # Errors cannot propagate out of a finalizer anyway, and the
            # browser may already be gone while the interpreter shuts down.
            pass


if __name__ == "__main__":
    # Google Maps search for "char koay teow" near George Town, Penang.
    search_url = "https://www.google.com/maps/search/char+koay+teow+near+George+Town,+Penang/@5.4198656,100.3257296,15z/data=!4m4!2m3!5m1!4e3!6e5"

    scraper = PlacesScraper(headless=True, load_images=False)
    try:
        data = scraper.get_places_data(search_url=search_url)
    finally:
        # Release the browser explicitly rather than relying on __del__.
        scraper.close()

    # One row per scraped place.
    df = pd.DataFrame(data)

我试过添加从网上学到的代码,但它不起作用。

我找不到的是以下内容。请帮助。

  • 搜索关键字“Char Kuey Teow”所得地点的代表照片/缩略图网址
  • 它的网址
  • 该地点/餐厅/咖啡馆的评论中,包含搜索关键字“Char Kuey Teow”的评论
google-maps selenium-webdriver web-scraping url
© www.soinside.com 2019 - 2024. All rights reserved.