1、问题背景 在使用 ElementTree 解析 XML 文档时，有时需要忽略具有特定属性值的节点。例如，在下面给定的 XML 文档中，我们需要忽略属性 name="Liechtenstein" 的 country 节点，以及属性 month="08" 的子节点，并解析其余部分。
XML 文档示例:
<data>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<language>english</language>
<currency>1.21$/kg</currency>
<gdppc month="06">141100</gdppc>
<gdpnp month="10">2.304e+0150</gdpnp>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<language>english</language>
<currency>4.1$/kg</currency>
<gdppc month="05">59900</gdppc>
<gdpnp month="08">5.2e-015</gdpnp>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Lahore">
<rank updated="yes">8</rank>
<language>Pertr</language>
<currency>7.3$/kg</currency>
<gdppc month="010">34000</gdppc>
<gdpnp month="099">3.4e+015</gdpnp>
<neighbor name="Peru" direction="N"/>
</country>
</data>
2、解决方案 有两种方法可以在解析时忽略具有特定属性值的节点：
(1) 使用 xml.etree.ElementTree 模块
import xml.etree.ElementTree as PARSER
def parse_xml_ignore_nodes(xml_data, name_attribute, month_attribute,
                           ignored_name="Liechtenstein", ignored_month="08"):
    """Collect the child texts of every <country>, skipping ignored nodes.

    Args:
        xml_data: XML document as a string.
        name_attribute: Attribute name checked on each <country> element.
        month_attribute: Attribute name checked on each child of a <country>.
        ignored_name: A <country> whose ``name_attribute`` equals this string
            is skipped entirely. Defaults to "Liechtenstein".
        ignored_month: A child whose ``month_attribute`` equals this string
            is skipped. Defaults to "08".

    Returns:
        A list with one entry per kept <country>; each entry is the list of
        non-empty ``text`` values of its kept children, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed.
    """
    root = PARSER.fromstring(xml_data)
    result = []
    # Element.getiterator()/getchildren() were removed in Python 3.9;
    # Element.iter() and direct iteration are the supported equivalents.
    for country in root.iter("country"):
        # .get() returns None when the attribute is absent, which never equals
        # a string, so elements lacking the attribute are kept.
        if country.get(name_attribute) == ignored_name:
            continue
        texts = [
            child.text
            for child in country
            # Children without text (e.g. empty <neighbor/>) contribute nothing.
            if child.get(month_attribute) != ignored_month and child.text
        ]
        result.append(texts)
    return result
# Sample document: three <country> elements, each holding child elements,
# some of which carry a "month" attribute.
xml_data = """
<data>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<language>english</language>
<currency>1.21$/kg</currency>
<gdppc month="06">141100</gdppc>
<gdpnp month="10">2.304e+0150</gdpnp>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<language>english</language>
<currency>4.1$/kg</currency>
<gdppc month="05">59900</gdppc>
<gdpnp month="08">5.2e-015</gdpnp>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Lahore">
<rank updated="yes">8</rank>
<language>Pertr</language>
<currency>7.3$/kg</currency>
<gdppc month="010">34000</gdppc>
<gdpnp month="099">3.4e+015</gdpnp>
<neighbor name="Peru" direction="N"/>
</country>
</data>
"""
# "name" is the attribute checked on each <country>; "month" is the attribute
# checked on each of its children.
result = parse_xml_ignore_nodes(xml_data, "name", "month")
print(result)
输出结果:
[['5', 'english', '4.1$/kg', '59900'], ['8', 'Pertr', '7.3$/kg', '34000', '3.4e+015']]
(2) 使用 lxml 模块（第三方库，需先执行 pip install lxml 安装）
注意：下面示例导入的 lxml.html 是 HTML 解析器；如果需要严格按照 XML 规则解析，可以改用 lxml.etree。
import lxml.html as PARSER
def parse_xml_ignore_nodes_lxml(xml_data, name_attribute, month_attribute,
                                ignored_name="Liechtenstein",
                                ignored_month="08"):
    """Collect the child texts of every <country>, skipping ignored nodes.

    Args:
        xml_data: Document markup as a string.
        name_attribute: Attribute name checked on each <country> element.
        month_attribute: Attribute name checked on each child of a <country>.
        ignored_name: A <country> whose ``name_attribute`` equals this string
            is skipped entirely. Defaults to "Liechtenstein".
        ignored_month: A child whose ``month_attribute`` equals this string
            is skipped. Defaults to "08".

    Returns:
        A list with one entry per kept <country>; each entry is the list of
        non-empty ``text`` values of its kept children, in document order.
    """
    root = PARSER.fromstring(xml_data)
    # XPath variables ($attr, $value) keep caller-supplied strings out of the
    # expression text, so quotes inside them cannot break or alter the query.
    # not(... = $value) is used instead of "!=" so that a <country> without
    # the attribute is kept rather than silently dropped.
    countries = root.xpath(
        "//country[not(@*[name() = $attr] = $value)]",
        attr=name_attribute,
        value=ignored_name,
    )
    result = []
    for country in countries:
        # Iterate the element directly; getchildren() is deprecated in lxml.
        texts = [
            child.text
            for child in country
            # Children without text (e.g. empty <neighbor/>) contribute nothing.
            if child.get(month_attribute) != ignored_month and child.text
        ]
        result.append(texts)
    return result
# Sample document: three <country> elements, each holding child elements,
# some of which carry a "month" attribute.
xml_data = """
<data>
<country name="Liechtenstein">
<rank updated="yes">2</rank>
<language>english</language>
<currency>1.21$/kg</currency>
<gdppc month="06">141100</gdppc>
<gdpnp month="10">2.304e+0150</gdpnp>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank updated="yes">5</rank>
<language>english</language>
<currency>4.1$/kg</currency>
<gdppc month="05">59900</gdppc>
<gdpnp month="08">5.2e-015</gdpnp>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Lahore">
<rank updated="yes">8</rank>
<language>Pertr</language>
<currency>7.3$/kg</currency>
<gdppc month="010">34000</gdppc>
<gdpnp month="099">3.4e+015</gdpnp>
<neighbor name="Peru" direction="N"/>
</country>
</data>
"""
# "name" is the attribute checked on each <country>; "month" is the attribute
# checked on each of its children.
result = parse_xml_ignore_nodes_lxml(xml_data, "name", "month")
print(result)
输出结果:
[['5', 'english', '4.1$/kg', '59900'], ['8', 'Pertr', '7.3$/kg', '34000', '3.4e+015']]
希望这些代码对你有帮助!