以下谷歌地图抓取器取自 github.com,我在此基础上做了进一步修改。我需要从抓取到的地点中获取下面这些数据;抓取对象是马来西亚槟城乔治市售卖特色食物炒粿条(Char Kuey Teow)的店家:
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
class PlacesScraper:
    """Scrape place cards from a Google Maps search-results page via Selenium.

    The scraper owns a Chrome WebDriver instance.  Call :meth:`close` (or use
    the instance as a context manager) when done so the browser and its
    driver process are shut down.
    """

    # CSS class used to locate the individual result cards on the page.
    _CARD_CLASS = "bfdHYd"
    # Keys assigned, in order, to the first lines of a card's text.
    _ITEM_KEYS = ("name", "rating", "address", "timing")
    # "!3d<lat>!4d<lng>" fragment embedded in a place URL.
    _COORD_RE = re.compile(r"!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)")
    # CSS url("https://...") value inside an inline style attribute.
    _STYLE_URL_RE = re.compile(r"url\(\"(https:[^)]+)\"\)")

    def __init__(
        self,
        headless=True,
        load_images=False,
        chromedriver_path="chromedriver.exe",
        window_size=(700, 900),
    ):
        """Start a Chrome WebDriver session.

        Args:
            headless: Run Chrome without a visible window.
            load_images: When false, image loading is disabled through the
                ``profile.managed_default_content_settings.images`` pref.
            chromedriver_path: Path to the chromedriver executable.
            window_size: ``(width, height)`` passed to ``set_window_size``.
        """
        # Set first so close()/__del__ stay safe if start-up fails below.
        self.driver = None

        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
        if not load_images:
            prefs = {"profile.managed_default_content_settings.images": 2}
            options.add_experimental_option("prefs", prefs)
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        # Newer Selenium 4 releases only accept the driver path through a
        # Service object; older releases only know ``executable_path``.
        # Try the current API first and fall back to the legacy keyword.
        from selenium.webdriver.chrome.service import Service

        try:
            self.driver = webdriver.Chrome(
                service=Service(chromedriver_path), options=options
            )
        except TypeError:
            self.driver = webdriver.Chrome(
                executable_path=chromedriver_path, options=options
            )
        self.driver.set_window_size(*window_size)

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down when leaving a ``with`` block."""
        self.close()

    def get_places_data(
        self,
        search_url,
        timeout=10,
        max_results=100,
    ):
        """Load *search_url* and return one dict per result card.

        The result list is scrolled (by bringing the last loaded card into
        view) until *timeout* seconds have elapsed or at least *max_results*
        cards are present.

        Args:
            search_url: Google Maps search URL to open.
            timeout: Maximum number of seconds spent scrolling.
            max_results: Upper bound on the number of returned items
                (must be <= 120).

        Returns:
            A list of dicts (empty when no card was found).  Each dict holds
            the leading text lines under ``name``, ``rating``, ``address``
            and ``timing`` (as many as the card provides), plus ``text``,
            ``url``, ``latitude``, ``longitude`` and ``photo url``.  When the
            rating line has the form ``score(count)``, ``rating`` is a float
            and ``rating count`` an int; otherwise ``rating`` keeps its raw
            text and ``rating count`` is absent.  ``latitude``/``longitude``
            are ``None`` when the URL carries no coordinates.

        Raises:
            AssertionError: If *max_results* is greater than 120.
        """
        assert max_results <= 120, "max_results should be <= 120"
        self.driver.get(search_url)
        time.sleep(1)

        places_card = []
        started = time.time()
        # Scroll to the bottom: bringing the last card into view makes the
        # page load the next batch of results.
        while time.time() - started <= timeout and len(places_card) < max_results:
            places_card = self.driver.find_elements(
                By.CLASS_NAME, self._CARD_CLASS
            )
            if places_card:
                self.driver.execute_script(
                    "arguments[0].scrollIntoView();", places_card[-1]
                )
            else:
                time.sleep(1)

        # Cards arrive in batches, so more than max_results may be loaded.
        return [self._card_to_item(card) for card in places_card[:max_results]]

    def _card_to_item(self, card):
        """Convert one result-card element into a plain dict."""
        # The link sits in the card's parent div, hence the "..//a" XPath.
        place_url = card.find_element(By.XPATH, "..//a").get_property("href")
        latitude, longitude = self._parse_coordinates(place_url)

        # Name, rating, address and timing come as newline-separated text.
        text = card.find_element(By.XPATH, ".//div[4]").text
        # Drop non-ASCII characters, then collapse runs of spaces.
        text_raw = re.sub(r"[^\x00-\x7f]", r"", text)
        text = re.sub(r" +", " ", text_raw)

        item = dict(zip(self._ITEM_KEYS, text.split("\n")))
        parsed_rating = self._parse_rating(item.get("rating", ""))
        if parsed_rating is not None:
            item["rating"], item["rating count"] = parsed_rating

        item.update({"text": text_raw})
        item.update({"url": place_url})
        item.update({"latitude": latitude})
        item.update({"longitude": longitude})
        item.update({"photo url": self._photo_url(card)})
        return item

    def _photo_url(self, card):
        """Return the card's thumbnail URL, or "" when none can be found."""
        images = card.find_elements(By.XPATH, ".//div[1]/div/a/img")
        if not images:
            return ""
        image = images[0]
        # get_attribute may return None when the attribute is missing.
        match = self._STYLE_URL_RE.search(image.get_attribute("style") or "")
        if match:
            return match.group(1)
        # An <img> normally carries its image in ``src`` rather than in a
        # CSS url(...) value, so fall back to that.
        return image.get_attribute("src") or ""

    @classmethod
    def _parse_coordinates(cls, place_url):
        """Return ``(latitude, longitude)`` from a place URL, or ``(None, None)``."""
        match = cls._COORD_RE.search(place_url or "")
        if match is None:
            return None, None
        return float(match.group(1)), float(match.group(2))

    @staticmethod
    def _parse_rating(rating_text):
        """Parse ``"4.5(1,234)"`` into ``(4.5, 1234)``; return None otherwise."""
        try:
            score_part, count_part = rating_text.split("(", 1)
            count = int(count_part.split(")")[0].replace(",", ""))
            return float(score_part), count
        except ValueError:
            # e.g. "No reviews" or any line that is not "score(count)".
            return None

    def close(self):
        """Quit the browser and end the WebDriver session (idempotent)."""
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        # Clear the reference first so a second call is a no-op.
        self.driver = None
        # quit() ends the whole session; close() would only close the window.
        driver.quit()

    def __del__(self):
        """Best-effort cleanup when the scraper is garbage-collected."""
        try:
            self.close()
        except Exception:
            # Errors cannot propagate out of __del__, and the driver may
            # already be unusable during interpreter shutdown.
            pass
if __name__ == "__main__":
    # Search-results page to scrape: char koay teow near George Town, Penang.
    search_url = "https://www.google.com/maps/search/char+koay+teow+near+George+Town,+Penang/@5.4198656,100.3257296,15z/data=!4m4!2m3!5m1!4e3!6e5"

    # Headless browser with image loading switched off.
    scraper = PlacesScraper(load_images=False, headless=True)

    # One dict per scraped place, turned into a table.
    data = scraper.get_places_data(search_url=search_url)
    df = pd.DataFrame(data)
我试着加入了一些从网上学到的代码,但它不起作用。
以下是我还无法获取的内容,请帮忙。