我有以下汤:
<a href="some_url">next</a>
<span class="class">...</span>
从中我想提取 href 的值,即“some_url”,
以及此页面上列出的页面的完整列表:https://www.catholic-hierarchy.org/diocese/laa.html
注意:页面上有很多指向子页面的链接,我需要逐一解析它们。目前的目标是获取所有数据:教区、网址、描述、联系方式等。
下面的示例会获取所有教区的 URL,抓取每个教区的部分信息,并生成最终的数据帧(DataFrame)。为了加快速度,这里使用了多进程池(multiprocessing.Pool):
但是等等:如何在不使用多进程(multiprocessing)的情况下运行这个爬虫?我想在 Colab 中运行它,因此需要去掉多进程功能。
如何实现这一点..!?
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
def get_dioceses_urls(section_url):
    """Collect the URLs of all diocese pages listed under one index section.

    Starting at ``section_url``, each index page is downloaded, every link
    inside a ``<ul>`` whose ``href`` starts with ``"d"`` is recorded, and the
    "[Next Page]" link is followed until a page no longer has one.

    Request headers are taken from the module-level ``headers`` dict.

    Args:
        section_url: Absolute URL of the first index page of the section.

    Returns:
        A set of absolute diocese-page URLs.
    """
    base_url = "https://www.catholic-hierarchy.org/diocese/"
    dioceses_urls = set()
    visited = set()

    # Also stop if the pagination ever points back to a page already seen;
    # otherwise a circular "next" link would loop forever.
    while section_url not in visited:
        visited.add(section_url)
        print(section_url)

        # Without an explicit timeout a stalled connection blocks forever.
        response = requests.get(section_url, headers=headers, timeout=60)
        soup = BeautifulSoup(response.content, "lxml")

        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(base_url + a["href"])

        # is there Next Page button?
        # NOTE: the ``:has()`` pseudo-class needs Beautiful Soup >= 4.7
        # (soupsieve); older releases raise NotImplementedError here.
        next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
        if not next_page:
            break
        section_url = base_url + next_page["href"]

    return dioceses_urls
def get_diocese_info(url):
    """Download one diocese page and extract its headings and key/value facts.

    The result always contains ``"Title 1"``, ``"Title 2"``, ``"Title 3"``
    (text of the first ``<h1>``/``<h2>``/``<h3>``, or ``"-"`` when the heading
    is absent) and ``"URL"``. If a childless ``<li>`` containing
    "type of jurisdiction:" is found, every ``<li>`` of the ``<ul>`` located
    via ``find_previous("ul")`` that contains a colon is added as
    ``{text before first colon: text after it}``.

    Request headers are taken from the module-level ``headers`` dict.

    Args:
        url: Absolute URL of the diocese page.

    Returns:
        A dict mapping field names to strings.
    """
    print(url)
    # Without an explicit timeout a stalled connection blocks forever.
    response = requests.get(url, headers=headers, timeout=60)
    soup = BeautifulSoup(response.content, "html5lib")

    def heading_text(tag):
        # A missing heading yields the same "-" placeholder for all levels.
        return tag.get_text(strip=True) if tag is not None else "-"

    data = {
        "Title 1": heading_text(soup.h1),
        "Title 2": heading_text(soup.h2),
        "Title 3": heading_text(soup.h3),
        "URL": url,
    }

    # Anchor on the "Type of Jurisdiction" entry: a leaf <li> (no child tags)
    # whose text contains that label.
    li = soup.find(
        lambda tag: tag.name == "li"
        and "type of jurisdiction:" in tag.text.lower()
        and tag.find() is None
    )
    facts_list = li.find_previous("ul") if li else None
    if facts_list is not None:
        for item in facts_list.find_all("li"):
            text = item.get_text(strip=True, separator=" ")
            if ":" in text:
                # Split on the first colon only: values (e.g. web addresses)
                # may themselves contain colons.
                key, value = text.split(":", maxsplit=1)
                data[key.strip()] = value.strip()

    # get other info about the diocese
    # ...

    return data
# Request headers shared by every download. Kept at module level rather than
# inside the ``__main__`` guard so that pool workers created with the "spawn"
# start method, which re-import this module, can still find the name.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0"
}

if __name__ == "__main__":
    # pandas is used below but is not imported at the top of the script.
    import pandas as pd

    base_url = "https://www.catholic-hierarchy.org/diocese/"

    # get main sections:
    url = "https://www.catholic-hierarchy.org/diocese/laa.html"
    soup = BeautifulSoup(
        requests.get(url, headers=headers, timeout=60).content, "html.parser"
    )

    # The start page itself is a section; the others are linked from it.
    main_sections = [url] + [
        base_url + a["href"] for a in soup.select("a[target='_parent']")
    ]

    all_data, dioceses_urls = [], set()
    with Pool() as pool:
        # get all dioceses urls:
        for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
            dioceses_urls.update(urls)

        # get info about all dioceses:
        for info in pool.imap_unordered(get_diocese_info, dioceses_urls):
            all_data.append(info)

    # create dataframe from the info about dioceses
    df = pd.DataFrame(all_data).sort_values("Title 1")

    # save it to csv file
    df.to_csv("data.csv", index=False)

    # DataFrame.to_markdown() needs the optional "tabulate" package.
    print(df.head().to_markdown())
更新:在 Colab 上运行该脚本时,得到如下结果:
https://www.catholic-hierarchy.org/diocese/laa.htmlhttps://www.catholic-hierarchy.org/diocese/lab.html
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "<ipython-input-1-f5ea34a0190f>", line 21, in get_dioceses_urls
next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1403, in select_one
value = self.select(selector, limit=1)
File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1528, in select
'Only the following pseudo-classes are implemented: nth-of-type.')
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
"""
The above exception was the direct cause of the following exception:
NotImplementedError Traceback (most recent call last)
<ipython-input-1-f5ea34a0190f> in <module>
81 with Pool() as pool:
82 # get all dioceses urls:
---> 83 for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
84 dioceses_urls.update(urls)
85
/usr/lib/python3.7/multiprocessing/pool.py in next(self, timeout)
746 if success:
747 return value
--> 748 raise value
749
750 __next__ = next # XXX
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
以下是以异步方式获取该信息的一种方法(应该适用于 Colab 笔记本)。我从网站的不同部分获得了教区网址(结构化视图 - 世界地区)。我希望那里的教区计数与字母列表中的计数相匹配。
from httpx import Client, AsyncClient, Limits
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
from datetime import datetime
import asyncio
import nest_asyncio
nest_asyncio.apply()
# HTTP headers passed to every httpx client created in this script.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
}

# Shared accumulator: get_diocese_info appends one
# (name, bishops, info, url) tuple per successfully scraped page.
big_df_list = list()
def all_dioceses():
    """Return the URLs of every diocese linked from the structured-view pages.

    Downloads ``qview1.html`` through ``qview7.html``, removes the
    ``ul#menu2`` element when present, and collects the ``href`` of every
    ``ul > li > a`` link, prefixed with the site's ``/diocese/`` base URL.

    Returns:
        A list of absolute URLs, in page order (duplicates are not removed).

    Raises:
        httpx.HTTPStatusError: If an index page answers with an error status;
            silently skipping one would drop a whole region of dioceses.
    """
    base_url = 'https://www.catholic-hierarchy.org/diocese/'
    root_links = [f'{base_url}qview{x}.html' for x in range(1, 8)]
    dioceses = []
    with Client(headers=headers, timeout=60.0, follow_redirects=True) as client:
        for root_link in root_links:
            r = client.get(root_link)
            r.raise_for_status()
            # NOTE(review): no parser is named, so Beautiful Soup picks the
            # best one installed; the selector below depends on how that
            # parser nests <li> elements - confirm before pinning one.
            soup = bs(r.text)

            # Drop the navigation menu so its links are not collected;
            # tolerate pages that do not have it.
            menu = soup.select_one('ul#menu2')
            if menu is not None:
                menu.decompose()

            # ``a[href]`` skips anchors without a target, which would
            # otherwise make the concatenation fail on None.
            for link in soup.select('ul > li > a[href]'):
                dioceses.append(base_url + link['href'])
    return dioceses
# print(all_dioceses())
def _clean_text(tag):
    """Return the tag's text with each run of whitespace collapsed to one space."""
    # get_text(strip=True) strips every text node separately and then glues
    # them together ("Jose" + "Kalluvelil" -> "JoseKalluvelil"); collapsing
    # whitespace on the unstripped text keeps the spacing the page has.
    return ' '.join(tag.get_text().split())


async def get_diocese_info(url):
    """Fetch one diocese page and append its summary to ``big_df_list``.

    On success a ``(name, bishops, info, url)`` tuple is appended, where
    ``name`` is the centred ``<h1>`` text and ``bishops`` / ``info`` are the
    ``<li>`` texts of the first and second ``<td>`` of the table directly
    under ``div#d1``, joined with ``' | '``. Any failure (network error,
    unexpected page layout) is printed together with the URL and the page is
    skipped, so one bad page does not abort the whole scrape.

    Args:
        url: Absolute URL of the diocese page.
    """
    async with AsyncClient(headers=headers, timeout=60.0, follow_redirects=True) as client:
        try:
            r = await client.get(url)
            # Hand Beautiful Soup the raw bytes so it detects the page
            # encoding itself; decoding via ``r.text`` produced U+FFFD
            # replacement characters in accented names.
            soup = bs(r.content)
            d_name = _clean_text(soup.select_one('h1[align="center"]'))
            info_table = soup.select_one('div[id="d1"] > table')
            cells = info_table.select('td')
            d_bishops = ' | '.join(_clean_text(x) for x in cells[0].select('li'))
            d_extra_info = ' | '.join(_clean_text(x) for x in cells[1].select('li'))
            big_df_list.append((d_name, d_bishops, d_extra_info, url))
            print('done', d_name)
        except Exception as e:
            # Deliberately broad: this is the per-page boundary of a
            # best-effort scrape; report the page and carry on.
            print(url, e)
async def scrape_dioceses(concurrency=100):
    """Scrape every diocese page with a bounded number of concurrent workers.

    The URLs returned by ``all_dioceses()`` are placed in a queue and
    ``concurrency`` workers pull from it, awaiting ``get_diocese_info`` for
    each URL. The elapsed wall-clock time is printed at the end.

    Args:
        concurrency: Number of worker coroutines, i.e. the maximum number of
            pages fetched at the same time. Defaults to 100.
    """
    start_time = datetime.now()

    # Queue the URLs rather than ready-made coroutine objects: a coroutine
    # is then only created when a worker is about to await it, so nothing is
    # left "never awaited" if the run is interrupted.
    pending = asyncio.Queue()
    for url in all_dioceses():
        pending.put_nowait(url)

    async def worker():
        # empty() and get_nowait() run without an await in between, so no
        # other worker can take the last item between the check and the get.
        while not pending.empty():
            await get_diocese_info(pending.get_nowait())

    await asyncio.gather(*(worker() for _ in range(concurrency)))

    duration = datetime.now() - start_time
    print('diocese scraping took', duration)
# Run the whole scrape; get_diocese_info appends to big_df_list as it goes.
asyncio.run(scrape_dioceses())

# One DataFrame row per collected (name, bishops, info, url) tuple.
column_names = ['Name', 'Bishops', 'Info', 'Url']
df = pd.DataFrame(big_df_list, columns=column_names)
print(df)
终端结果:
done Eparchy of Mississauga (Syro-Malabar)
done Eparchy of Mar Addai of Toronto (Chaldean)
done Eparchy of Saint-Sauveur de Montréal (Melkite Greek)
done Diocese of Calgary
done Archdiocese of Winnipeg
[...]
diocese scraping took 0:03:02.366096
Name Bishops Info Url
0 Eparchy of Mississauga (Syro-Malabar) JoseKalluvelil, Bishop Type of Jurisdiction: Eparchy | Elevated:22 December2018 | Immediately Subject to the Holy See | Syro-Malabar Catholic Church of the Chaldean Tradition | Country:Canada | Mailing Address: Syro-Malabar Apostolic Exarchate, 6630 Turner Valley Rd., Mississauga, ON L5V 2P1, Canada | Telephone: (905)858-8200 | Fax: 858-8208 https://www.catholic-hierarchy.org/diocese/dmism.html
1 Eparchy of Mar Addai of Toronto (Chaldean) Robert SaeedJarjis, Bishop | Bawai (Ashur)Soro, Bishop Emeritus Type of Jurisdiction: Eparchy | Erected:10 June2011 | Immediately Subject to the Holy See | Chaldean Catholic Church of the Chaldean Tradition | Country:Canada | Conference Region:Ontario | Mailing Address: 2 High Meadow Place, Toronto, ON M9L 2Z5, Canada | Telephone: (416)746-5816 | Fax: 746-5850 https://www.catholic-hierarchy.org/diocese/dtoch.html
2 Eparchy of Saint-Sauveur de Montréal (Melkite Greek) MiladJawish, B.S., Bishop Type of Jurisdiction: Eparchy | Elevated:1 September1984 | Immediately Subject to the Holy See | Melkite Greek Catholic Church of the Byzantine Tradition | Country:Canada | Conference Region:Quebec | Web Site:http://www.melkite.com/ | Mailing Address: 10025 boul. de l'Arcadie, Montreal, QC H4N 2S1, Canada | Telephone: (514)272.6430 | Fax: 202.1274 https://www.catholic-hierarchy.org/diocese/dmome.html
3 Diocese of Calgary William TerrenceMcGrattan, Bishop | Frederick BernardHenry, Bishop Emeritus Type of Jurisdiction: Diocese | Erected:30 November1912 | Metropolitan: Archdiocese ofEdmonton | Rite: Latin (or Roman) | Province: Alberta | Country:Canada | Square Kilometers: 110,500 (42,680 Square Miles) | Conference Region:West (Ouest) | Catholic Directory Abbreviation: Cal | Official Web Site:http://www.calgarydiocese.ca/ | Mailing Address: Catholic Pastoral Centre, Room 290, The Iona Building, 120-17th Avenue S.W., Calgary, AB T2S 2T2, Canada | Telephone: (403)218-5528 | Fax: 264-0272 https://www.catholic-hierarchy.org/diocese/dcalg.html
4 Archdiocese of Winnipeg Richard JosephGagnon, Archbishop | James VernonWeisgerber, Archbishop Emeritus Type of Jurisdiction: Archdiocese | Erected:4 December1915 | Immediately Subject to the Holy See | Rite: Latin (or Roman) | Province: Manitoba | Country:Canada | Square Kilometers: 116,405 (44,961 Square Miles) | Conference Region:West (Ouest) | Catholic Directory Abbreviation: W | Official Web Site:http://www.archwinnipeg.ca/ | Mailing Address: Chancery Office, 1495 Pembina Highway, Winnipeg, MB R3T 2C6, Canada | Telephone: (204)452-2227 | Fax: 475-4409 https://www.catholic-hierarchy.org/diocese/dwinn.html
... ... ... ... ...
2619 Archiepiscopal Exarchate of Krym (Ukrainian) Vacant | Makariy BohdanLeniv, O.S.B.M., Apostolic Administrator | MykhayloBubniy, C.SS.R., Archiepiscopal Administrator Type of Jurisdiction: Archiepiscopal Exarchate | Split:13 February2014 | Metropolitan: Archeparchy ofKyiv-Halyč {Kiev} (Ukrainian) | Ukrainian Catholic Church of the Byzantine Tradition | Country:Ukraine | Mailing Address: vul. Schmidta 22/12, 65000 Odessa, Ukraina | Telephone: (0482)32.58.90 | Fax: 32.58.89 https://www.catholic-hierarchy.org/diocese/dkrym.html
2620 Diocese of Lutsk VitaliySkomarovskyi, Bishop | MarkijanTrofym’yak, Bishop Emeritus Type of Jurisdiction: Diocese | Split:28 October1925 | Metropolitan: Archdiocese ofLviv | Rite: Latin (or Roman) | Country:Ukraine | Square Kilometers: 40,190 (15,523 Square Miles) | Official Web Site:http://catholic.volyn.ua/ | Mailing Address: Kuria Diecezjalna, vul. Katedralna 17, 43016 Lutsk, Ukraina | Telephone: (0332)72.15.32 | Fax: (same) https://www.catholic-hierarchy.org/diocese/dluts.html
2621 Diocese of Stockholm AndersArborelius, O.C.D., Cardinal, Bishop Type of Jurisdiction: Diocese | Elevated:29 June1953 | Immediately Subject to the Holy See | Rite: Latin (or Roman) | Country:Sweden | Square Kilometers: 450,295 (173,926 Square Miles) | Official Web Site:https://www.katolskakyrkan.se | Mailing Address: Katolska Biskopsambetet, Gotgatan 68, P.O. Box 4114, S-102 62 Stockholm, Sverige | Telephone: (08)462.66.02 | Fax: 702.05.55 https://www.catholic-hierarchy.org/diocese/dstos.html
2622 Archeparchy of Diarbekir (Amida) (Chaldean) RamziGarmou, Ist. del Prado, Archbishop Type of Jurisdiction: Archeparchy | Elevated:3 January1966 | Chaldean Catholic Church of the Chaldean Tradition | Country:Turkey | Mailing Address: Archeveche Chaldeen, Hamalbasi Caddesi 20, Galatasaray, 34435 Beyoglu, Istanbul, Turkiye | Telephone: (0212)252.34.49 | Fax: (same) https://www.catholic-hierarchy.org/diocese/ddiar.html
2623 Eparchy of Kolomyia (Ukrainian) VasylIvasyuk, Bishop Type of Jurisdiction: Eparchy | Split:12 September2017 | Metropolitan: Archeparchy ofIvano-Frankivsk [Stanislaviv] (Ukrainian) | Ukrainian Catholic Church of the Byzantine Tradition | Country:Ukraine | Square Kilometers: 14,000 (5,407 Square Miles) | Official Web Site:https://kolugcc.org.ua | Mailing Address: vul. Ivana Franka 29, 78200 Kolomyia, Ukraina | Telephone: (06891)19.707 https://www.catholic-hierarchy.org/diocese/dkolo.html
2624 rows × 4 columns
如您所见,此代码将在大约 3 分钟内提取 2.6k 教区的完整信息,同时使用的资源远少于多处理或多线程。
您需要安装以下内容(安装或升级,只需在colab笔记本中一一运行这些命令即可):
# asyncio is part of Python's standard library; the PyPI package of the same
# name is an obsolete backport, so it is intentionally not installed here.
pip install -U nest-asyncio
pip install -U httpx
# "bs4" on PyPI is only a shim; the real distribution is beautifulsoup4.
pip install -U beautifulsoup4
pip install -U pandas
我还导入了 re,以防您想要一一选择信息位(司法管辖区、传统、地址、网站等),每个信息都在 try/ except 块中,以解释丢失的信息,并相应地扩展列表/数据框。上述所有包都可以在 https://pypi.org/ 上找到,并有文档记录。
在 Google Colab 上运行该脚本的问题在于:它目前只支持 Python 3.7,不支持最新版本的 BeautifulSoup,所以你使用的 a:has 选择器不受支持。我已经把它替换为遍历所有 a 标签的循环,速度稍慢,但代码可以在 Google Colab 上运行,而且不需要去掉多进程。不过,如果您确实需要去掉多进程,那么应该把函数改写为协程,并按照 @鸭嘴兽巴里 的建议,使用 asyncio 将它们作为任务运行。
def get_dioceses_urls(section_url):
    """Collect the URLs of all diocese pages listed under one index section.

    Starting at ``section_url``, each index page is downloaded, every link
    inside a ``<ul>`` whose ``href`` starts with ``"d"`` is recorded, and the
    "[Next Page]" link is followed until a page no longer has one. The next
    link is located with a plain loop over ``<a>`` tags instead of the
    ``:has()`` CSS pseudo-class, so it also works with Beautiful Soup
    releases that do not implement that selector.

    Request headers are taken from the module-level ``headers`` dict.

    Args:
        section_url: Absolute URL of the first index page of the section.

    Returns:
        A set of absolute diocese-page URLs.
    """
    base_url = "https://www.catholic-hierarchy.org/diocese/"
    dioceses_urls = set()
    visited = set()

    # Also stop if the pagination ever points back to a page already seen;
    # otherwise a circular "next" link would loop forever.
    while section_url not in visited:
        visited.add(section_url)
        print(section_url)

        # Without an explicit timeout a stalled connection blocks forever.
        response = requests.get(section_url, headers=headers, timeout=60)
        soup = BeautifulSoup(response.content, "lxml")

        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(base_url + a["href"])

        # is there Next Page button?
        # ``.get("alt")`` rather than ``["alt"]``: an image without an alt
        # attribute must be skipped, not raise KeyError.
        next_page = next(
            (
                a
                for a in soup.find_all("a")
                if a.img and a.img.get("alt") == "[Next Page]"
            ),
            None,
        )
        if not next_page:
            break
        section_url = base_url + next_page["href"]

    return dioceses_urls