开发一个能够从 PubMed 提取论文标题、作者列表、发表时间以及摘要的程序,检索关键字为“乳腺癌”,时间窗口为 2023年6月1日至2023年12月31日。将检索到的数据保存为 CSV 格式以供以后使用。
它无法正常工作,我不知道如何修复它/做什么 - 非常感谢任何帮助! :)
pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
import csv
# Constants
# Base URL of the PubMed web search page (results are served from this path).
SEARCH_URL = "https://pubmed.ncbi.nlm.nih.gov/"
# Search term for the PubMed query.
QUERY = "Breast Cancer"
# Publication-date window, formatted YYYY/MM/DD.
START_DATE = "2023/06/01"
END_DATE = "2023/12/31"
# Minimal browser-like User-Agent so the site serves the normal HTML page.
HEADERS = {
"User-Agent": "Mozilla/5.0"
}
def fetch_article_links(query, start_date, end_date):
    """Search PubMed for *query* within a publication-date window.

    Parameters:
        query: free-text search term.
        start_date / end_date: window bounds as "YYYY/MM/DD" strings.

    Returns:
        List of relative article hrefs (e.g. "/12345678/") from the
        first page of search results.

    Raises:
        requests.HTTPError: if the search request fails.
    """
    # The PubMed web UI does not honor `mindate`/`maxdate` query
    # parameters (those belong to the E-utilities API), so the date
    # window must be embedded in the search term itself using PubMed's
    # [Date - Publication] field syntax.
    params = {
        "term": (
            f'({query}) AND '
            f'("{start_date}"[Date - Publication] : "{end_date}"[Date - Publication])'
        ),
    }
    response = requests.get(SEARCH_URL, params=params, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each result row's title is an <a class="docsum-title"> with a
    # relative href pointing at the article page.
    return [a['href'] for a in soup.find_all('a', class_='docsum-title')]
def fetch_article_details(article_link):
    """Fetch one PubMed article page and extract its metadata.

    Parameters:
        article_link: absolute URL of the article page.

    Returns:
        Dict with keys "Title", "Authors" (comma-joined), "Publication
        Date", and "Abstract". Missing fields become empty strings.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    response = requests.get(article_link, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Any of these elements may be absent (e.g. articles without an
    # abstract); the original `.find(...).text` crashed with
    # AttributeError in that case, so guard each lookup.
    title_tag = soup.find('h1', class_='heading-title')
    title = title_tag.text.strip() if title_tag else ""
    authors = [a.text.strip() for a in soup.find_all('a', class_='full-name')]
    cit_tag = soup.find('span', class_='cit')
    pub_date = cit_tag.text.strip() if cit_tag else ""
    abstract_tag = soup.find('div', class_='abstract-content')
    abstract = abstract_tag.text.strip() if abstract_tag else ""
    return {
        "Title": title,
        "Authors": ", ".join(authors),
        "Publication Date": pub_date,
        "Abstract": abstract
    }
def save_to_csv(data, filename='pubmed_breast_cancer.csv'):
    """Write the article records to *filename* as a UTF-8 CSV file.

    Parameters:
        data: iterable of dicts with keys "Title", "Authors",
            "Publication Date", and "Abstract".
        filename: output path; overwritten if it already exists.
    """
    columns = ["Title", "Authors", "Publication Date", "Abstract"]
    with open(filename, mode='w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for record in data:
            writer.writerow(record)
def main():
    """Run the full pipeline: search, fetch each article, save to CSV."""
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    records = []
    for relative in fetch_article_links(QUERY, START_DATE, END_DATE):
        full_link = f"{base_url}{relative}"
        try:
            records.append(fetch_article_details(full_link))
        except Exception as e:
            # Skip articles that fail to parse; report and keep going.
            print(f"Error fetching details for {full_link}: {e}")
    save_to_csv(records)
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
main()
试试这个。
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
# Base URL of the PubMed site; also used to absolutize relative article links.
BASE_URL = "https://pubmed.ncbi.nlm.nih.gov"
# Search term for the PubMed query.
QUERY = "Breast Cancer"
# Publication-date window, formatted YYYY/MM/DD.
START_DATE = "2023/06/01"
END_DATE = "2023/12/31"
# Browser-like User-Agent so the site serves the normal HTML page.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0',
}
def fetch_links(query, start_date, end_date):
    """Search PubMed for *query* limited to a publication-date window.

    Returns the relative hrefs of the result entries on the first page.
    """
    # Embed the date window in the term using PubMed field syntax.
    term = f'(({query})) AND (("{start_date}"[Date - Publication] : "{end_date}"[Date - Publication]))'
    response = requests.get(BASE_URL, params={'term': term}, headers=HEADERS)
    soup = BeautifulSoup(response.text, "lxml")
    anchors = soup.find_all("a", class_="docsum-title")
    return [anchor["href"] for anchor in anchors]
def fetch_details(link):
    """Fetch one PubMed article page and extract its metadata.

    Parameters:
        link: absolute URL of the article page.

    Returns:
        Dict with keys "title", "authors" (comma-joined), "date", and
        "abstract". Missing fields are None.
    """
    response = requests.get(link, headers=HEADERS)
    soup = BeautifulSoup(response.text, "lxml")
    # Guard every lookup: `select_one` returns None when the element is
    # missing, and the original guarded only the abstract — a page
    # without a title or citation crashed with AttributeError.
    title_tag = soup.select_one("h1.heading-title")
    title = title_tag.text.strip() if title_tag else None
    authors = [author.text.strip() for author in soup.select("a.full-name")]
    cit_tag = soup.select_one("span.cit")
    # The citation reads like "2023 Jun;15(2):123-130."; drop everything
    # from the first ";" to keep only the date portion.
    pub_date = re.sub(";.*", "", cit_tag.text.strip()) if cit_tag else None
    abstract_tag = soup.select_one("div.abstract-content")
    abstract = abstract_tag.text.strip() if abstract_tag else None
    return {
        "title": title,
        "authors": ", ".join(authors),
        "date": pub_date,
        "abstract": abstract
    }
if __name__ == "__main__":
    # Search, then fetch every result page and collect the records.
    records = []
    for href in fetch_links(QUERY, START_DATE, END_DATE):
        # Make into absolute URL.
        url = BASE_URL + href
        print(url)
        try:
            records.append(fetch_details(url))
        except Exception as e:
            # Report the failing page and continue with the rest.
            print(f"Error fetching details for {url}: {e}")
    # Persist everything for later use.
    pd.DataFrame(records).to_csv("pubmed_breast_cancer.csv", index=False)
正如评论中提到的,请改进您的问题,至少补充必要的细节(例如具体的报错信息和期望的输出)。这些最少的信息对那些试图回答您的问题的人很有帮助。