赞
踩
lxml 是 Python 语言中功能最丰富、最易于使用的 XML 和 HTML 处理库。
lxml XML 工具包是 C 库 libxml2 和 libxslt 的 Pythonic 绑定。它的独特之处在于,将这些库的速度和 XML 功能完整性与原生 Python API 的简洁性结合在一起;该 API 与众所周知的 ElementTree API 大部分兼容,但更胜一筹。最新版本适用于从 2.7 到 3.9 的所有 CPython 版本。有关 lxml 项目背景和目标的更多信息,请参阅其简介。
下载源码:
git clone https://github.com/lxml/lxml.git lxml
或者安装库:
pip install lxml
#pip install lxml==3.4.2
import requests
from lxml import etree
# Download the page. requests applies no timeout by default, so set one
# explicitly to keep the script from hanging on an unresponsive server.
res = requests.get("http://www.jsons.cn/zt/", timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page.
res.raise_for_status()
html = res.text
# Parse the HTML text into an element tree and keep its root element.
root_element = etree.HTML(html)
print(root_element)
print(root_element.tag)
# Parse an HTML string
from lxml import etree

text = '''
<html><body>
<div class="key">
    <div class="name">无羡</div>
    <div class="age">20</div>
    <div class="address">四川</div>
</div>
</body></html>
'''

# Build the element tree; etree.HTML() expects the markup as a string
html = etree.HTML(text)
print(html)
# Show the type of the parsed object (the original printed the builtin
# `type` itself instead of calling it)
print(type(html))

# Serialize the tree back to a str. encoding='unicode' returns text directly
# and keeps the non-ASCII characters readable; the default ASCII output would
# replace them with numeric character references.
result = etree.tostring(html, encoding='unicode')
print(result)
print(type(result))
from lxml import etree
# Read and parse the HTML file. etree.parse() uses an XML parser unless told
# otherwise, which rejects HTML that is not well-formed XML, so pass an
# HTMLParser explicitly.
html = etree.parse('1.html', etree.HTMLParser())
# Serialize the parsed document back to a string
result = etree.tostring(html).decode('utf-8')
print(result)
print(type(result))
# Parse the serialized markup again; etree.HTML() expects an HTML string
html = etree.HTML(result)
print(html)
# Show the type of the parsed object (the original printed the builtin
# `type` itself instead of calling it)
print(type(html))
from lxml import etree
# Turn the markup held in `html` into a parsed tree
document = etree.HTML(html)
# XPath query; the trailing text() step selects the text of each <a> element
link_text_query = '//a/text()'
# Run the query; the matches come back as a list
link_texts = document.xpath(link_text_query)
# Print the extracted list
print(link_texts)
以递归方式遍历树,并用它的元素做一些事情。
>>> root = etree.Element("root")
>>> etree.SubElement(root, "child").text = "Child 1"
>>> etree.SubElement(root, "child").text = "Child 2"
>>> etree.SubElement(root, "another").text = "Child 3"
>>> print(etree.tostring(root, pretty_print=True))
<root>
  <child>Child 1</child>
  <child>Child 2</child>
  <another>Child 3</another>
</root>
>>> for element in root.iter():
...     print("%s - %s" % (element.tag, element.text))
root - None
child - Child 1
child - Child 2
another - Child 3
>>> for element in root.iter("child"):
...     print("%s - %s" % (element.tag, element.text))
child - Child 1
child - Child 2
>>> for element in root.iter("another", "child"):
...     print("%s - %s" % (element.tag, element.text))
child - Child 1
child - Child 2
another - Child 3
>>> root = etree.XML('<root><a><b/></a></root>')
>>> etree.tostring(root)
b'<root><a><b/></a></root>'
>>> print(etree.tostring(root, xml_declaration=True))
<?xml version='1.0' encoding='ASCII'?>
<root><a><b/></a></root>
>>> print(etree.tostring(root, encoding='iso-8859-1'))
<?xml version='1.0' encoding='iso-8859-1'?>
<root><a><b/></a></root>
>>> print(etree.tostring(root, pretty_print=True))
<root>
  <a>
    <b/>
  </a>
</root>
>>> root = etree.Element("root", interesting="totally")
>>> etree.tostring(root)
b'<root interesting="totally"/>'
>>> print(root.get("interesting"))
totally
>>> print(root.get("hello"))
None
>>> root.set("hello", "Huhu")
>>> print(root.get("hello"))
Huhu
>>> etree.tostring(root)
b'<root interesting="totally" hello="Huhu"/>'
>>> sorted(root.keys())
['hello', 'interesting']
>>> for name, value in sorted(root.items()):
...     print('%s = %r' % (name, value))
hello = 'Huhu'
interesting = 'totally'
>>> attributes = root.attrib
>>> print(attributes["interesting"])
totally
>>> print(attributes.get("no-such-attribute"))
None
>>> attributes["hello"] = "Guten Tag"
>>> print(attributes["hello"])
Guten Tag
>>> print(root.get("hello"))
Guten Tag
>>> d = dict(root.attrib)
>>> sorted(d.items())
[('hello', 'Guten Tag'), ('interesting', 'totally')]
>>> root = etree.Element("root")
>>> root.text = "TEXT"
>>> print(root.text)
TEXT
>>> etree.tostring(root)
b'<root>TEXT</root>'
使用xpath获取所有段落的文本
# -*- coding: UTF-8 -*-
from lxml import etree


def fetch_text(html):
    """Return the text nodes that are direct children of every <p> element.

    Args:
        html: HTML markup as a string.

    Returns:
        The list produced by evaluating the XPath ``//p/text()``.
    """
    # Keep the parsed tree in its own name instead of rebinding the parameter
    tree = etree.HTML(html)
    return tree.xpath("//p/text()")


if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    # The result holds paragraph text, so name it accordingly
    texts = fetch_text(html)
    print(texts)
# -*- coding: UTF-8 -*-
from lxml import etree


def fetch_text(html):
    """Return every text node found anywhere in the document.

    Args:
        html: HTML markup as a string.

    Returns:
        The list produced by evaluating the XPath ``//text()``.
    """
    # Keep the parsed tree in its own name instead of rebinding the parameter
    tree = etree.HTML(html)
    return tree.xpath("//text()")


if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p>body 元素的内容会显示在浏览器中。</p>
        <p>title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    # The result holds text nodes, so name it accordingly
    texts = fetch_text(html)
    print(texts)
# -*- coding: UTF-8 -*-
from lxml import etree


def fetch_text(html):
    """Return the text of the <p> elements whose class attribute is "item-1".

    Args:
        html: HTML markup as a string.

    Returns:
        The list produced by evaluating the XPath
        ``//p[@class='item-1']/text()``.
    """
    # Keep the parsed tree in its own name instead of rebinding the parameter
    tree = etree.HTML(html)
    return tree.xpath("//p[@class='item-1']/text()")


if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    # The result holds paragraph text, so name it accordingly
    texts = fetch_text(html)
    print(texts)
如果您觉得该方法或代码有一点点用处,可以给作者点个赞,或打赏杯咖啡;
╮( ̄▽ ̄)╭
如果您感觉方法或代码不咋地
//(ㄒoㄒ)//,就在评论处留言,作者继续改进;
o_O???
如果您需要相关功能的代码定制化开发,可以留言私信作者;
(✿◡‿◡)
感谢各位大佬童鞋们的支持!
( ´ ▽´ )ノ ( ´ ▽´)っ!!!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。