爬虫解析常用工具
日志
from my_utils.logger import line_logger, color_logger, json_logger
爬虫loader
ItemLoader
>>> r = requests.get(url)
>>> loader = ItemLoader(text=r.text)
>>> tr_list = loader.css('.standard-table tbody tr')
>>> for tr in tr_list:
print(tr.xpath('./td[1]//text()').proc(ReFind(r'\S+'))) # default op=TakeFirst()
print(tr.xpath('./td[2]//text()').proc(ReFind(r'\S+'), op=Join()))
print(tr.xpath('./td[3]//text()').proc(lambda x: x.split('/'), op=None))
print(tr.xpath('./td[4]//text()').getall())
JmesLoader
基于 jmespath,教程见 https://jmespath.org/tutorial.html
过滤表达式中的字符串字面量一定要用单引号(双引号在 JMESPath 中表示标识符),比如:
JmesLoader(result).node("data[?object.question.type=='question']").getall()
>>> src_data = {
"people": [
{"first": "James", "last": "d"},
{"missing": "1111"},
{"first": "Jacob", "last": "e"},
{"first": "Jayden", "last": "f"},
{"missing": "different"}
],
"foo": {"bar": "baz"},
'bar': 1,
}
>>> loader = JmesLoader(src_data)
>>> loader.node('people[?keys(@)[?starts_with(@, `first`)]]')
>>> for item in loader.node('people[?keys(@)[?starts_with(@, `first`)]]'):
b = item.node('first').proc(lambda x: x.upper(), op=None)
c = item.node('firsts').proc(lambda x: x.upper()) # default op=TakeFirst()
d = item.node('last').proc()
e = item.node('last').getall()
print(b, c, d, e)
['JAMES'] None d ['d']
['JACOB'] None e ['e']
['JAYDEN'] None f ['f']
>>> print(loader.node('people[][first, missing][]').proc(op=Join(' ')))
James 1111 Jacob Jayden different
常用函数封装
>>> HEADERS = '''
Connection: keep-alive
X-Anit-Forge-Token:
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8
'''
>>> header_to_dict(HEADERS)
{'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
'Connection': 'keep-alive',
'X-Anit-Forge-Token': ''}
>>> cookies = 'a=111111111;b=222'
>>> cookie_to_dict(cookies)
{'a': '111111111', 'b': '222'}