当XML中相同的标签连续出现时,如何删除排在前面的标签

问题描述 投票:-1回答:1

大家好,我的文件中有两三个(或更多)<word>标签连续出现,如下例所示:

<word wordid="&lt;bon_Aljanuwb_1" value="إبْن الجَنُوب" synsetid="Aljanuwbiy_n1AR" frequency="1" corpus="manchester20060717" authorshipid="12030" />
<word wordid="&lt;bon__1" value="إبْن البَلَد" synsetid="&lt;ibon_Albalad_n1AR" frequency="" corpus="" authorshipid="12031" />
<word wordid="&lt;bonap_1" value="إبْنَة" synsetid="&lt;ibonap_n2AR" frequency="1" corpus="manchester20060717" authorshipid="12032" />
<word wordid="&lt;bonu__1" value="إبْنُ عُرْس" synsetid="&lt;ibonu_Euros_n1AR" frequency="" corpus="" authorshipid="12033" />
<word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />

我想做的是删除这些连续的<word>标签,只保留后面紧跟<form>标签的那一个,得到如下输出:

<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />

下面附上文件中更大的一个片段,其正确的结果应该是去掉其中的第一行。有什么想法吗?

<word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
<word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
<form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
<word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036" />
<form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036" />
xml python-3.7
1个回答
1
投票

你可以在lxml中使用XPath,选出所有紧随其后的第一个兄弟元素也是word的word元素,并将其删除。

例...

XML输入(input.xml)

<?xml version="1.0" encoding="UTF-8"?>
<doc>
    <test>
        <word wordid="&lt;bon_Aljanuwb_1" value="إبْن الجَنُوب" synsetid="Aljanuwbiy_n1AR" frequency="1" corpus="manchester20060717" authorshipid="12030" />
        <word wordid="&lt;bon__1" value="إبْن البَلَد" synsetid="&lt;ibon_Albalad_n1AR" frequency="" corpus="" authorshipid="12031" />
        <word wordid="&lt;bonap_1" value="إبْنَة" synsetid="&lt;ibonap_n2AR" frequency="1" corpus="manchester20060717" authorshipid="12032" />
        <word wordid="&lt;bonu__1" value="إبْنُ عُرْس" synsetid="&lt;ibonu_Euros_n1AR" frequency="" corpus="" authorshipid="12033" />
        <word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
        <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
        <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
    </test>
    <test>
        <word wordid="&lt;borAhAm__1" value="إبْراهام لينْكون" synsetid="&lt;iborAhAm_lynokwn_n1AR" frequency="" corpus="" authorshipid="12034" />
        <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035" />
        <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035" />
        <word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036" />
        <form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036" />
    </test>
</doc>

Python

from lxml import etree

# Load the source document into an element tree.
document = etree.parse("input.xml")

# Select every <word> whose immediately following sibling element is also a
# <word>; the XPath result is collected once, before any node is removed.
redundant_words = document.xpath("//word[following-sibling::*[1][self::word]]")
for node in redundant_words:
    parent = node.getparent()
    parent.remove(node)

# Serialize the pruned tree as UTF-8 with an XML declaration.
document.write("output.xml", encoding="utf-8", xml_declaration=True)

XML输出(output.xml)

<?xml version='1.0' encoding='UTF-8'?>
<doc>
    <test>
        <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035"/>
        <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035"/>
    </test>
    <test>
        <word wordid="&lt;botahaja_1" value="إبْتَهَجَ" synsetid="fariHa_v1AR" frequency="" corpus="" authorshipid="12035"/>
        <form value="بهج" wordid="&lt;botahaja_1" type="root" authorshipid="12035"/>
        <word wordid="&lt;botaz~a_1" value="إبْتَزَّ" synsetid="&lt;ibotaz~a_v1AR" frequency="" corpus="" authorshipid="12036"/>
        <form value="بزز" wordid="&lt;botaz~a_1" type="root" authorshipid="12036"/>
    </test>
</doc>
© www.soinside.com 2019 - 2024. All rights reserved.