I can reach the site successfully, but the scrape itself isn't working: the output I get is an empty dictionary. I'd appreciate it if someone could confirm whether the problem is the HTML tags I'm targeting or the request being denied. I've included an image of the terminal output here:
'''
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import pandas as pd

API_KEY = '7bbcbb39-029f-4075-97bc-6b57b6e9e68b'

def get_scrapeops_url(url):
    # route the request through the ScrapeOps proxy
    payload = {'api_key': API_KEY, 'url': url}
    return 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)

r = requests.get(get_scrapeops_url('https://bitbox.swiss/coins/'))
response = r.text

# list to store scraped data
data = []

# parse the website's HTML
soup = BeautifulSoup(response, 'html.parser')
result = soup.find('div', class_='coins-table')
name = soup.find('strong', style='cursor: auto;')
ticker = soup.find('span', style='cursor: auto;')

# store data in a dictionary using key/value pairs
d = {'name': name.text if name else None, 'ticker': ticker.text if ticker else None}
data.append(d)

# convert to a pandas DataFrame and export to CSV
data_df = pd.DataFrame(data)
data_df.to_csv("coins_scrape.csv", index=False)
print(data_df)
'''
The coin names are loaded by JavaScript from an external URL. You can simulate that request directly with the requests module:
import requests
import pandas as pd

url = 'https://bitbox.swiss/coins/tokens.json'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'}

# fetch the JSON the page loads via JavaScript and build a DataFrame from it
data = requests.get(url, headers=headers).json()
df = pd.DataFrame(data)
print(df)
Prints:
address name unit
0 e41d2489571d322189246dafa5ebde1f4699f498 0x Protocol ZRX
1 b6ed7644c69416d67b522e20bc294a9a9b405b31 0xBitcoin 0xBTC
2 111111111117dc0aa78b770fa6a738034120c302 1inch Network 1INCH
3 a4ef4b0b23c1fc81d3f9ecf93510e64f58a4a016 1MillionNFTs 1MIL
4 2c9c19ce3b15ae77c6d80aec3c1194cfd6f7f3fa 2crazyNFT 2CRZ
5 ff44b5719f0b77a9951636fc5e69d3a1fc9e7d73 4ART Coin 4ART
6 8888801af4d980682e47f1a9036e589479e835c5 88mph MPH
7 feea0bdd3d07eb6fe305938878c0cadbfa169042 8PAY 8PAY
8 7fc66500c84a76ad7e9c93437bfc5ac33e2ddae9 Aave AAVE
...
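
If you want to keep the CSV export from your original script, you can write the same DataFrame out directly. A minimal sketch, assuming the tokens.json response keeps the address/name/unit fields shown in the output above:

import requests
import pandas as pd

url = 'https://bitbox.swiss/coins/tokens.json'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'}

# same request as above
df = pd.DataFrame(requests.get(url, headers=headers).json())

# 'name' and 'unit' correspond to the name/ticker pair the original script tried to scrape
# (assumes those keys are present in the JSON, as in the sample output above)
df = df[['name', 'unit']].rename(columns={'unit': 'ticker'})
df.to_csv("coins_scrape.csv", index=False)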