I want to scrape this page, https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/, to retrieve each member's name, enlistment date, and discharge date. But after writing and running the code below, only the header row gets saved to the CSV file.
This is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL
url = "https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the relevant section containing the members' information
    content = soup.find('div', class_='entry-content')

    # Initialize a list to store the data
    data = []

    # Loop through the paragraphs and extract dates
    paragraphs = content.find_all('p')
    for paragraph in paragraphs:
        text = paragraph.get_text()
        if 'Enlistment date:' in text and 'Discharge date:' in text:
            lines = text.split('\n')
            for line in lines:
                if 'Enlistment date:' in line and 'Discharge date:' in line:
                    parts = line.split(' - ')
                    if len(parts) == 2:
                        name = parts[0].strip()
                        enlistment_date = parts[1].split('Enlistment date: ')[1].split(',')[0].strip()
                        discharge_date = parts[1].split('Discharge date: ')[1].strip()
                        data.append([name, enlistment_date, discharge_date])

    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Name', 'Enlistment Date', 'Discharge Date'])

    # Save the DataFrame to a CSV file
    df.to_csv('exo_military_dates.csv', index=False)
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)
Expected output: the data written to the CSV file should contain each member's name, enlistment date, and discharge date.
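Before fixing the parsing, it helps to look at what the page actually serves: the enlistment/discharge details appear to be written as prose paragraphs under numbered per-member headings, not as literal "Enlistment date:" / "Discharge date:" labels, which would explain why the if condition above never matches and data stays empty. A minimal inspection sketch, reusing the URL and entry-content selector from the script above:

import requests
from bs4 import BeautifulSoup

url = "https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
content = soup.find("div", class_="entry-content")

# Print the member headings and a preview of the first paragraphs so the
# real structure is visible before writing any parsing rules.
for h3 in soup.find_all("h3"):
    print("H3:", h3.get_text(strip=True))

for i, p in enumerate(content.find_all("p")[:10], start=1):
    print(f"P{i}:", p.get_text(strip=True)[:120])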
I'd probably use a regular expression to parse the enlistment/discharge dates:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")


def get_text(title):
    # Collect the text of the <p> siblings that directly follow the heading.
    out, p = [], title.find_next_sibling("p")
    while p:
        out.append(p.text)
        p = p.find_next_sibling()
        if not (p and p.name == "p"):
            break
    return " ".join(out)


print(f"{'NAME':<20} {'START':<20} {'END':<20}")
print("-" * 62)

# Member headings start with a number; drop it and keep the name.
for h3 in soup.find_all(name="h3", string=re.compile(r"^\d+")):
    name = h3.text.split(maxsplit=1)[-1]
    text = get_text(h3)
    # First group: date after an enlistment keyword; optional second group:
    # date after a discharge keyword.
    m = re.search(
        r"(?:enlisted|enlistment|started|began).*?(\S+ \d+, \d{4})(?:.*(?:discharge|back|finish).*?(\S+ \d+, \d+|\S+ \d{4}))?",
        text,
    )
    start, end = m[1], ("-" if not m[2] else m[2])
    print(f"{name:<20} {start:<20} {end:<20}")
Prints:
NAME                 START                END
--------------------------------------------------------------
Xiumin               May 7, 2019          December 6, 2020
D.O                  July 1, 2019         January 25, 2020
Suho                 May 14, 2020         February 14, 2022
Chen                 October 26, 2020     April 25, 2022
Chanyeol             March 29, 2021       September 2022
Baekhyun             May 6, 2021          February 5, 2023
Kai                  May 11, 2023         May 11, 2025
Sehun                December 21, 2023    -
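The second capture group in the regex is optional, which is why Sehun's still-open END column prints as "-". If you want these rows in a CSV file rather than printed, the same loop can collect the matches and write them out with pandas, reusing the exo_military_dates.csv filename from your original script. A sketch, under the same assumptions about the page structure:

import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")


def get_text(title):
    # Join the <p> siblings that directly follow a member heading.
    out, p = [], title.find_next_sibling("p")
    while p:
        out.append(p.text)
        p = p.find_next_sibling()
        if not (p and p.name == "p"):
            break
    return " ".join(out)


rows = []
for h3 in soup.find_all(name="h3", string=re.compile(r"^\d+")):
    name = h3.text.split(maxsplit=1)[-1]
    m = re.search(
        r"(?:enlisted|enlistment|started|began).*?(\S+ \d+, \d{4})(?:.*(?:discharge|back|finish).*?(\S+ \d+, \d+|\S+ \d{4}))?",
        get_text(h3),
    )
    if m:  # skip members whose paragraphs mention no enlistment date
        rows.append([name, m[1], m[2] or ""])

df = pd.DataFrame(rows, columns=["Name", "Enlistment Date", "Discharge Date"])
df.to_csv("exo_military_dates.csv", index=False)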