如何以 O(1) 内存使用 lxml?

问题描述 投票:0回答:1

我正在尝试使用 ~72G XML 文件。我想将其转换为 CSV。

这是我的代码;我假设它在后台使用迭代器,因为我似乎在某处读到过 lxml 是这样工作的。

from lxml import etree
import csv

file_name = "data/discogs_20231001_releases.xml"
# file_name = "data/sample.xml"


def handle_artists(artists):
    """Collect the ``id`` and ``name`` of every artist element.

    ``artists`` is iterated to obtain artist elements; each artist is in turn
    iterated to obtain child elements exposing ``.tag`` and ``.text``.

    Returns a list with one dict per artist. A dict only contains the keys
    ("id", "name") whose tags were actually present under that artist; when a
    tag occurs more than once, the last occurrence wins.
    """
    wanted_tags = ("id", "name")
    return [
        {field.tag: field.text for field in artist if field.tag in wanted_tags}
        for artist in artists
    ]


# Only "end" events are requested: on a "start" event lxml has not parsed the
# element's children yet, so the <title> child may not be available.
events = ("end",)
# ``tag="release"`` restricts the reported events to <release> elements.
context = etree.iterparse(file_name, events=events, tag="release")

# newline="" lets the csv module control line endings itself; the explicit
# encoding keeps non-ASCII titles independent of the platform default.
with open(
    "data/discogs_20231001_releases.csv", "w", newline="", encoding="utf-8"
) as f:
    w = csv.DictWriter(f, fieldnames=["release_id", "release_title"], delimiter="\t")
    w.writeheader()

    for _, elem in context:
        release = {"release_id": elem.get("id")}
        for child in elem:
            if child.tag == "title":
                release["release_title"] = child.text
            # NOTE: to export artists too, call handle_artists(child) for the
            # "artists" child here and add a matching CSV column.
        if release.get("release_title") is not None:
            w.writerow(release)

        # Keep memory flat: iterparse still builds the full tree behind the
        # scenes, so drop this release's content and unlink the already
        # processed siblings that the root element would otherwise retain.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

当我运行它时,我得到:

Job 1, 'python release.py' terminated by signal SIGKILL (Forced quit)

我不确定我是否在使用 XML 解析器或 CSV 编写器时犯了错误。

我用内存分析器检查,看它能否准确地告诉我哪一个是罪魁祸首。

Line #    Mem usage    Increment  Occurrences   Line Contents
=============================================================
    49     25.8 MiB     25.8 MiB           1   @profile
    50                                         def process_event(elem):
    51     25.8 MiB      0.0 MiB           1       release = {}
    52     25.8 MiB      0.0 MiB           1       if elem.tag == "release":
    53                                                 release_id = elem.get("id")
    54                                                 release = {"release_id": release_id}
    55                                                 for child in elem:
    56                                                     tag = child.tag
    57                                                     if tag == "title":
    58                                                         release["release_title"] = child.text
    59                                                     elif tag == "artists":
    60                                                         # release["artists"] = handle_artists(child)
    61                                                         pass
    62     25.8 MiB      0.0 MiB           1       return release
Line #    Mem usage    Increment  Occurrences   Line Contents
=============================================================
    49    176.0 MiB    176.0 MiB           1   @profile
    50                                         def process_event(elem):
    51    176.0 MiB      0.0 MiB           1       release = {}
    52    176.0 MiB      0.0 MiB           1       if elem.tag == "release":
    53                                                 release_id = elem.get("id")
    54                                                 release = {"release_id": release_id}
    55                                                 for child in elem:
    56                                                     tag = child.tag
    57                                                     if tag == "title":
    58                                                         release["release_title"] = child.text
    59                                                     elif tag == "artists":
    60                                                         # release["artists"] = handle_artists(child)
    61                                                         pass
    62    176.0 MiB      0.0 MiB           1       return release

按照建议改用 fast_iter 的解决方案后,出现以下错误:

Python(99102,0x202e0e080) malloc: *** error for object 0x1338b7fc0: pointer being freed was not allocated
Python(99102,0x202e0e080) malloc: *** set a breakpoint in malloc_error_break to debug
fish: Job 1, 'python release.py' terminated by signal SIGABRT (Abort)
python lxml
1个回答
0
投票

这是 SAX-y 重新实现;它只准确存储您在任何给定时刻正在解析的信息。

from xml.sax import parse, ContentHandler


class DiscogsFileHandler(ContentHandler):
    """SAX handler that assembles one release at a time and hands it off.

    Only the release currently being parsed is kept in memory. When a
    ``<releases>/<release>`` element ends, the assembled dict is passed to the
    ``write_release`` callback and then dropped.

    The dict handed to the callback contains the attributes of the
    ``<release>`` element, an ``"artists"`` list (one dict per
    ``<artists>/<artist>`` with that element's attributes plus ``"id"`` and
    ``"name"`` when present) and ``"title"`` when a ``<title>`` child exists.

    Args:
        write_release: Callable invoked with each completed release dict.
    """

    def __init__(self, *, write_release):
        super().__init__()
        self.current_release = None
        self.current_artist = None
        self.write_release = write_release
        # Names of the currently open elements, outermost first.
        self.tag_stack = []
        # Text collected since the most recent start or end tag.
        self.char_buffer = ""

    @property
    def tag_path(self) -> tuple:
        """Return the open-element path as a tuple, outermost first."""
        return tuple(self.tag_stack)

    def startElement(self, name: str, attrs):
        """Track the newly opened element and start a release/artist record."""
        self.tag_stack.append(name)
        # Discard text that preceded this tag (e.g. indentation in
        # pretty-printed XML) so it cannot leak into this element's value.
        self.char_buffer = ""
        path = self.tag_path
        if path == ("releases", "release"):
            assert self.current_release is None
            self.current_release = {**dict(attrs), "artists": []}
        elif path == ("releases", "release", "artists", "artist"):
            assert self.current_artist is None
            self.current_artist = dict(attrs)
            self.current_release["artists"].append(self.current_artist)

    def endElement(self, name: str):
        """Store the finished element's text or emit the finished release."""
        path = self.tag_path
        if path == ("releases", "release"):
            self.write_release(self.current_release)
            self.current_release = None
        elif path == ("releases", "release", "title"):
            self.current_release["title"] = self.char_buffer
        elif path == ("releases", "release", "artists", "artist", "name"):
            self.current_artist["name"] = self.char_buffer
        elif path == ("releases", "release", "artists", "artist", "id"):
            self.current_artist["id"] = self.char_buffer
        elif path == ("releases", "release", "artists", "artist"):
            self.current_artist = None
        # Pop outside the assert: under ``python -O`` asserts are stripped,
        # and the stack must still unwind.
        closed = self.tag_stack.pop()
        assert closed == name
        self.char_buffer = ""

    def characters(self, content: str):
        """Accumulate text; the parser may deliver one text node in pieces."""
        self.char_buffer += content


def write_release(rel: dict):
    """Report one parsed release on standard output.

    Placeholder sink: it only prints ``rel`` prefixed with "Got release:".
    """
    # Your CSV writing code here...
    print(f"Got release: {rel}")


# Run the SAX parser over "discogs.xml" with a DiscogsFileHandler that is given
# write_release as its callback.
parse("discogs.xml", DiscogsFileHandler(write_release=write_release))

输出是(例如;它不写入 CSV)

Got release: {'id': '1', 'status': 'Accepted', 'artists': [{'id': '1', 'name': 'The Persuader'}], 'title': 'Stockholm'}
Got release: {'id': '2', 'status': 'Accepted', 'artists': [{'id': '2', 'name': 'Mr. James Barth & A.D.'}], 'title': "Knockin' Boots (Vol 2 Of 2)"}
Got release: {'id': '3', 'status': 'Accepted', 'artists': [{'id': '3', 'name': 'Josh Wink'}], 'title': 'Profound Sounds Vol. 1'}
Got release: {'id': '4', 'status': 'Accepted', 'artists': [{'id': '21', 'name': 'Faze Action'}], 'title': 'Moving Cities'}
Got release: {'id': '5', 'status': 'Accepted', 'artists': [{'id': '22', 'name': 'Datacide'}], 'title': 'Flowerhead'}
Got release: {'id': '6', 'status': 'Accepted', 'artists': [{'id': '2', 'name': 'Mr. James Barth & A.D.'}], 'title': "Knockin' Boots (Vol 1 Of 2)"}
Got release: {'id': '7', 'status': 'Accepted', 'artists': [{'id': '28', 'name': 'Moonchildren'}], 'title': 'Moonchildren EP'}
Got release: {'id': '8', 'status': 'Accepted', 'artists': [{'id': '29', 'name': 'Sweet Abraham'}], 'title': 'Spreading Outward EP'}
Got release: {'id': '9', 'status': 'Accepted', 'artists': [{'id': '33', 'name': 'Blue Six'}], 'title': 'Pure'}
Got release: {'id': '10', 'status': 'Accepted', 'artists': [{'id': '36', 'name': 'Lovetronic'}], 'title': 'You Are Love'}

(我不想下载完整的文件,所以我下载了

curl -fSL 'https://discogs-data-dumps.s3-us-west-2.amazonaws.com/data/2023/discogs_20231001_releases.xml.gz' | head -c 10000000 | gzcat | xmllint --recover - > discogs.xml

至少抓住其中一些...)

© www.soinside.com 2019 - 2024. All rights reserved.