Web scraper does not produce CSV file output


Develop a tool that extracts the paper title, author list, publication date, and abstract from PUBMED for the keyword "breast cancer" within the time window June 1, 2023 to December 31, 2023. ▪ Save the retrieved data in CSV format for later use.

It doesn't work and I don't know how to fix it or what to do next - any help is much appreciated! :)


    pip install requests beautifulsoup4


    import requests
    from bs4 import BeautifulSoup
    import csv

    # Constants
    SEARCH_URL = "https://pubmed.ncbi.nlm.nih.gov/"
    QUERY = "Breast Cancer"
    START_DATE = "2023/06/01"
    END_DATE = "2023/12/31"
    HEADERS = {
        "User-Agent": "Mozilla/5.0"
    }

    def fetch_article_links(query, start_date, end_date):
        params = {
            "term": query,
            "mindate": start_date,
            "maxdate": end_date
        }
        response = requests.get(SEARCH_URL, params=params, headers=HEADERS)
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = [a['href'] for a in soup.find_all('a', class_='docsum-title')]
        return article_links

    def fetch_article_details(article_link):
        response = requests.get(article_link, headers=HEADERS)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('h1', class_='heading-title').text.strip()
        authors = [a.text.strip() for a in soup.find_all('a', class_='full-name')]
        pub_date = soup.find('span', class_='cit').text.strip()
        abstract = soup.find('div', class_='abstract-content').text.strip()
        return {
            "Title": title,
            "Authors": ", ".join(authors),
            "Publication Date": pub_date,
            "Abstract": abstract
        }

    def save_to_csv(data, filename='pubmed_breast_cancer.csv'):
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=["Title", "Authors", "Publication Date", "Abstract"])
            writer.writeheader()
            writer.writerows(data)

    def main():
        article_links = fetch_article_links(QUERY, START_DATE, END_DATE)
        base_url = "https://pubmed.ncbi.nlm.nih.gov"
        data = []
        for link in article_links:
            full_link = f"{base_url}{link}"
            try:
                article_details = fetch_article_details(full_link)
                data.append(article_details)
            except Exception as e:
                print(f"Error fetching details for {full_link}: {e}")
        save_to_csv(data)

    if __name__ == "__main__":
        main()
web-scraping pubmed
1 Answer

Try this. The main fix is that the date range is built into the search term itself with PubMed's [Date - Publication] field tag; the bare mindate/maxdate parameters in your code belong to NCBI's E-utilities API, not to the search page, so your request was effectively an unfiltered query. The abstract is also checked before calling .strip() (not every article has one), and pandas writes the CSV, so you will additionally need pip install lxml pandas for the parser and DataFrame used below.

import requests
import pandas as pd
import re
from bs4 import BeautifulSoup

BASE_URL = "https://pubmed.ncbi.nlm.nih.gov"

QUERY = "Breast Cancer"
START_DATE = "2023/06/01"
END_DATE = "2023/12/31"

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0',
}

def fetch_links(query, start_date, end_date):
    # Build the date range into the query string itself using PubMed's
    # [Date - Publication] field tag; this is the filter syntax the
    # search page actually understands.
    params = {
        'term': f'(({query})) AND (("{start_date}"[Date - Publication] : "{end_date}"[Date - Publication]))',
    }
    response = requests.get(BASE_URL, params=params, headers=HEADERS)
    soup = BeautifulSoup(response.text, "lxml")
    # Each result's title is a link (relative URL) to the article page.
    return [link["href"] for link in soup.select("a.docsum-title")]


def fetch_details(link):
    response = requests.get(link, headers=HEADERS)

    soup = BeautifulSoup(response.text, "lxml")
    title = soup.select_one("h1.heading-title").text.strip()
    authors = [author.text.strip() for author in soup.select("a.full-name")]
    # The citation span reads like "2023 Jun 15;41(3):123-130."; drop
    # everything from the first semicolon on to keep just the date.
    pub_date = re.sub(";.*", "", soup.select_one("span.cit").text.strip())
    # Some articles have no abstract, so guard against None here.
    abstract = soup.select_one("div.abstract-content")
    if abstract:
        abstract = abstract.text.strip()

    return {
        "title": title,
        "authors": ", ".join(authors),
        "date": pub_date,
        "abstract": abstract
    }

if __name__ == "__main__":
    links = fetch_links(QUERY, START_DATE, END_DATE)
    data = []
    for link in links:
        # Make into absolute URL.
        link = BASE_URL+link
        print(link)
        try:
            details = fetch_details(link)
            data.append(details)
        except Exception as e:
            print(f"Error fetching details for {link}: {e}")
    data = pd.DataFrame(data)

    data.to_csv("pubmed_breast_cancer.csv", index=False)
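
Note that fetch_links only reads the first page of results, and PubMed shows 10 hits per page by default, so a six-month window will almost certainly be truncated. A minimal sketch of paging through results, reusing BASE_URL and HEADERS from the script above and assuming the search page's page query parameter (it works at the time of writing, but it is undocumented, so verify before relying on it):

import time

def fetch_all_links(term, max_pages=5):
    links = []
    for page in range(1, max_pages + 1):
        # Request one result page at a time via the page parameter.
        params = {'term': term, 'page': page}
        response = requests.get(BASE_URL, params=params, headers=HEADERS)
        soup = BeautifulSoup(response.text, "lxml")
        page_links = [a["href"] for a in soup.select("a.docsum-title")]
        if not page_links:
            break  # ran past the last page of results
        links.extend(page_links)
        time.sleep(1)  # small delay so the scraper stays polite
    return links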

As mentioned in the comments, please improve your question. At a minimum you should:

  1. Fix the formatting of your code.
  2. Specify what the expected result is.
  3. Specify what the observed result is.

That minimal information helps anyone trying to answer your question.
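
As an aside, the mindate/maxdate parameters from the original code do exist, but on NCBI's E-utilities API rather than on the PubMed website. If scraping the HTML turns out to be fragile, the esearch endpoint returns matching PubMed IDs directly; a minimal sketch using the documented esearch parameters (verify the details against NCBI's E-utilities docs):

import requests

# NCBI E-utilities esearch: returns PubMed IDs matching a query.
ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

params = {
    "db": "pubmed",
    "term": "Breast Cancer",
    "datetype": "pdat",      # filter on publication date
    "mindate": "2023/06/01",
    "maxdate": "2023/12/31",
    "retmax": 100,           # maximum number of IDs to return
    "retmode": "json",
}
response = requests.get(ESEARCH_URL, params=params)
pmids = response.json()["esearchresult"]["idlist"]
# pmids is a list of PubMed ID strings; pass them to the efetch
# endpoint (or to the article pages) to get titles and abstracts.
print(len(pmids), "articles found")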
