丽水县政府文章爬虫

31 阅读1分钟
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
import scrapy
from scrapy.http import Request
from urllib import parse
import json
import time
import http.client

# Entry URL: the article-list page of the Lishui municipal government site.
url = "https://www.lishui.gov.cn/col/col1229265122/index.html?df=/col/col1229286995/index.html&isgk=1"

# Minimal browser-like request headers; Referer and User-Agent help avoid
# trivial bot blocking on government portals.
headers = {
    'Accept': '*/*',
    'Referer': 'https://www.lishui.gov.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}

# Fetch the page once. A timeout prevents a hung connection from blocking
# forever, and raise_for_status fails fast on HTTP errors instead of
# silently parsing an error page.
response = requests.get(url=url, headers=headers, timeout=15)
response.raise_for_status()
# Decode the body once and reuse it — the original decoded the same bytes
# three separate times.
html = response.content.decode('utf-8')

# --- lxml route: XPath is supported directly and returns a list of
# --- href strings (possibly empty if the node is not present).
tree = etree.HTML(html)
print(11111, tree.text)
href = tree.xpath('//*[@id="7853602"]/div/table/tbody/tr[2]/td[1]/a/@href')
print(22222, href, response.status_code)

# --- BeautifulSoup route ---
# BUG FIX: BeautifulSoup has no XPath support. The original
# soup.find(xpath='//*[@id=...]...') merely searched for a tag carrying an
# HTML attribute literally named "xpath", which never matches, so it always
# returned None. Use an equivalent CSS selector via select_one instead.
# Notes on the selector: [id="7853602"] is used because a #id CSS selector
# cannot start with a digit without escaping; tbody is omitted because
# html.parser does not insert/normalize <tbody> the way browsers do.
soup = BeautifulSoup(html, 'html.parser')
anchor = soup.select_one('[id="7853602"] table tr:nth-of-type(2) td:nth-of-type(1) a')
# Guard against a missing node so we print None instead of raising TypeError.
href = anchor.get('href') if anchor is not None else None
print(333333, href)
```
```


1、首先,import requests库,url = "https://www.lishui.gov.cn/col/col1229265122/index.html?df=/col/col1229286995/index.html&isgk=1",这个url是入口url。
2、headers请求头包含Accept、Referer、User-Agent。
3、response = requests.get(url=url, headers=headers)
4、tree = etree.HTML(response.content.decode(encoding='utf8')),其中 etree 来自 from lxml import etree,将页面解析为可执行 XPath 查询的文档树。
5、href = tree.xpath('//*[@id="7853602"]/div/table/tbody/tr[2]/td[1]/a/@href')
6、soup = BeautifulSoup(response.content.decode(encoding='utf-8'), 'html.parser'),其中 BeautifulSoup 来自 from bs4 import BeautifulSoup,用标准库的 html.parser 再次解析同一页面。注意:BeautifulSoup 本身不支持 XPath,soup.find(xpath=...) 不会按 XPath 查找(该关键字会被当作名为 xpath 的 HTML 属性过滤条件),如需按路径定位应改用 CSS 选择器(soup.select_one)或直接使用 lxml 的 xpath。