1、 简介
使用的接口
多边形搜索接口: 搜索区域内的POI数据; 行政区域查询接口: 查询区域的经纬度范围
主要基于 多边形搜索接口 进行爬取, 尽可能的爬取更多的POI。
包含网格划分、并发爬取、射线法区域过滤、断点重爬(即电脑关机重启后再执行程序可从上次爬取位置继续爬避免重复执行)
2、代码实现
2.1、PoiUtil.py
由于只要爬取国内的POI数据, 所以先从经纬度轮廓查询网址 获取出中国的大致轮廓范围到coordinates对象, 然后在之上封装实现了射线法去判断某个坐标是否在中国内。
射线法: 从一个点作一条射线, 如果射线与多边形边界的交点数量为奇数, 则该点在多边形内; 为偶数则在多边形外。
# Rough outline of China as a closed ring of [longitude, latitude] vertices
# (the first and last vertex are the same point).
coordinates = [
    [95.625, 44.276671273775186],
    [91.845703125, 44.15068115978094],
    [88.41796875, 45.89000815866184],
    [87.49511718750003, 49.03786794532641],
    [83.62792968750003, 46.67959446564017],
    [84.15527343750001, 42.84375132629021],
    [79.01367187500001, 41.11246878918086],
    [76.11328125000001, 39.40224434029275],
    [77.34375000000003, 35.85343961959177],
    [80.55175781250003, 36.066862132578834],
    [78.22265625, 32.43561304116276],
    [87.75878906250003, 27.0982539061379],
    [90.966796875, 29.38217507514529],
    [95.14160156250003, 30.03105542654018],
    [98.21777343750003, 28.18824364185026],
    [99.140625, 26.745610382199022],
    [97.33886718750001, 24.20688962239801],
    [101.16210937500001, 21.125497636606266],
    [101.84326171875, 21.555284406923192],
    [101.60156250000004, 22.350075806124853],
    [104.23828125, 23.079731762449878],
    [105.3204345703125, 23.306946133072103],
    [105.567626953125, 23.069624397708267],
    [106.67724609375, 21.922663209325922],
    [108.050537109375, 21.427503474404084],
    [108.6712646484375, 21.570610571132665],
    [109.1326904296875, 21.361013117950915],
    [109.8028564453125, 21.427503474404084],
    [109.5611572265625, 20.863944849076905],
    [109.88525390624999, 20.122997556207757],
    [108.56689453125, 19.673625561844393],
    [108.6053466796875, 18.46397859132042],
    [109.6160888671875, 18.124970639386515],
    [110.5828857421875, 18.81791748264768],
    [110.687255859375, 19.25929414046391],
    [111.07177734375, 19.663280219987662],
    [110.9893798828125, 20.122997556207757],
    [110.4510498046875, 20.251890313952938],
    [110.56640625, 20.49906428341304],
    [110.4730224609375, 20.797201434307],
    [110.6982421875, 20.90500300215911],
    [110.7366943359375, 21.34054846908118],
    [111.0113525390625, 21.391704731036587],
    [111.2640380859375, 21.39937662852242],
    [111.58264160156251, 21.488852324870077],
    [112.00836181640625, 21.593596778619787],
    [112.0880126953125, 21.75439787437119],
    [112.2967529296875, 21.675295939487672],
    [112.42172241210938, 21.64594062309775],
    [112.51373291015625, 21.58593511478851],
    [112.70736694335936, 21.642111228517322],
    [112.76229858398436, 21.55911609985186],
    [112.92022705078125, 21.60636531720339],
    [112.8900146484375, 21.815608175662636],
    [113.291015625, 21.84620351827813],
    [113.45855712890625, 22.009267904493782],
    [113.7249755859375, 22.179775161509696],
    [114.27429199218749, 22.13653163760967],
    [114.5819091796875, 22.405950148725722],
    [115.147705078125, 22.760986169250472],
    [115.56518554687499, 22.621616907276728],
    [116.55120849609375, 22.94574762338832],
    [116.61712646484375, 23.140359987886118],
    [116.8341064453125, 23.23882237936205],
    [116.86431884765625, 23.387640227334956],
    [117.14996337890625, 23.392681978612988],
    [117.62786865234375, 23.70740932619981],
    [118.07556152343749, 24.224423957057557],
    [118.23486328125, 24.369615456124507],
    [118.94897460937499, 24.701924833689933],
    [120.06958007812499, 25.423431426334222],
    [120.794677734375, 24.816653556469955],
    [119.718017578125, 23.704894502324912],
    [120.750732421875, 21.69826549685252],
    [121.33300781249999, 22.917922936146045],
    [121.9921875, 24.826624956562167],
    [122.10205078125, 25.21488107113259],
    [121.48681640624999, 25.423431426334222],
    [119.718017578125, 25.859223554761407],
    [120.2783203125, 26.78484736105119],
    [120.95947265624999, 27.595934774495056],
    [121.75048828124999, 28.372068829631633],
    [122.73925781250001, 30.259067203213018],
    [121.39892578125, 32.87036022808352],
    [120.66284179687499, 33.815666308702774],
    [119.61914062500001, 35.02999636902568],
    [121.24511718750003, 36.491973470593656],
    [122.37670898437499, 36.721273880045004],
    [122.93701171874999, 37.36142550190517],
    [121.06933593750001, 38.09998264736478],
    [119.61914062500001, 37.54457732085582],
    [119.53125000000003, 38.16911413556081],
    [117.75146484375, 38.634036452919226],
    [118.91601562500001, 39.027718840211605],
    [121.24511718750003, 40.74725696280421],
    [122.08007812500001, 40.713955826286025],
    [121.06933593750001, 39.06184913429151],
    [121.37695312500001, 38.61687046392973],
    [123.09082031250001, 39.53793974517623],
    [125.068359375, 40.48038142908172],
    [126.71630859375, 41.672911819602085],
    [128.056640625, 41.40153558289846],
    [128.001708984375, 42.08191667830631],
    [129.13330078125, 42.21224516288584],
    [129.913330078125, 42.96446257387128],
    [130.62744140624997, 42.56926437219384],
    [131.1328125, 44.84029065139799],
    [132.82470703125, 45.089035564831036],
    [134.033203125, 46.830133640447386],
    [134.9560546875, 48.50204750525715],
    [131.0009765625, 47.60616304386874],
    [129.48486328125, 49.25346477497736],
    [127.19970703125, 49.76707407366792],
    [125.5078125, 52.802761415419674],
    [122.87109375, 53.74871079689897],
    [120.498046875, 53.25206880589411],
    [120.234375, 50.93073802371819],
    [117.2900390625, 49.296471602658066],
    [115.83984375, 48.3416461723746],
    [119.75097656250003, 47.18971246448421],
    [111.884765625, 43.8028187190472],
    [104.765625, 41.11246878918088],
    [96.064453125, 42.74701217318067],
    [95.625, 44.276671273775186],
]
# Flip the vertex order of the ring in place.
coordinates.reverse()
def isIntersect(poi, a, b):
    """Tell whether the horizontal ray cast from ``poi`` crosses segment ``a``-``b``.

    The ray starts at ``poi`` and runs towards increasing x (index 0) at the
    constant y (index 1) of ``poi``.

    :param poi: point as an ``[x, y]`` sequence
    :param a: first endpoint of the segment, ``[x, y]``
    :param b: second endpoint of the segment, ``[x, y]``
    :return: True when the ray crosses the segment, False otherwise
    """
    ray_y = poi[1]
    a_y = a[1]
    b_y = b[1]

    # Horizontal segment (parallel to or lying on the ray) or a degenerate
    # segment whose endpoints share the same y: never counted.
    if a_y == b_y:
        return False
    # Segment entirely above or entirely below the ray.
    if (a_y > ray_y and b_y > ray_y) or (a_y < ray_y and b_y < ray_y):
        return False
    # The ray passes exactly through an endpoint: the segment is only counted
    # when its other endpoint lies below the ray, so a shared vertex is not
    # counted for the edge that goes upwards from it.
    if (a_y == ray_y and b_y > ray_y) or (b_y == ray_y and a_y > ray_y):
        return False

    # x coordinate where the segment meets the ray's y, by similar triangles.
    cross_x = b[0] - (b[0] - a[0]) * (b[1] - poi[1]) / (b[1] - a[1])
    # A crossing to the left of the ray origin does not count.
    return not cross_x < poi[0]
def isInRange(poi, edge_list: list):
    """Ray-casting point-in-polygon test.

    Only consecutive vertex pairs ``edge_list[i] -> edge_list[i + 1]`` are
    tested, so the ring is expected to repeat its first vertex at the end.

    :param poi: point as an ``[x, y]`` sequence
    :param edge_list: polygon vertices in order
    :return: True when the ray from ``poi`` crosses an odd number of edges
    """
    crossings = sum(
        1
        for a, b in zip(edge_list, edge_list[1:])
        if isIntersect(poi, a, b)
    )
    # An odd number of crossings means the point is inside.
    return crossings % 2 == 1
def isInChina(poi):
    """Check a point against the module-level ``coordinates`` outline.

    :param poi: ``[longitude, latitude]`` coordinate
    :return: True when the point falls inside the outline
    """
    return isInRange(poi, edge_list=coordinates)
if __name__ == '__main__':
    # Quick manual check with a single sample coordinate.
    print(isInChina(poi=[121.67864690856112, 34.89095702880306]))
2.2、爬取工具 GaodeUtil.py
调用高德地图的API需要密钥KEY,将代码里的该KEY替换为自己的
主要封装大范围矩形区域的爬取逻辑: 先将该大矩形划分为一个个细粒度的网格, 再并发地爬取每个网格内的POI数据, 最后汇总各个文件的爬取结果。
import os
import re
import time
import traceback
import concurrent.futures as cft
import requests
import requests.adapters
from 脚本.B6_爬取高德POI import PoiUtil
# Polygon search endpoint (POIs inside a polygon/rectangle).
Road_Url = "https://restapi.amap.com/v3/place/polygon"
# Administrative district lookup endpoint.
District_Url = "https://restapi.amap.com/v3/config/district"
# Number of POIs requested per page.
page_size = 25
# Amap developer KEY. TODO: replace with your own key.
KEY = ""
# Shared HTTP session used by the crawler.
session = requests.session()
# Kept from the original setup; NOTE(review): requests does not appear to
# read this attribute - getRoad sends an explicit "Connection: close" header.
session.keep_alive = False
# Give the session real connection retries. Reassigning
# requests.adapters.DEFAULT_RETRIES after import does not do that: the
# adapters' default was already bound when requests was imported, so the
# retry count has to be passed to an adapter explicitly.
_retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
session.mount("https://", _retry_adapter)
session.mount("http://", _retry_adapter)
def findDistinct(area: str) -> dict:
    """Look up the bounding box of an administrative area and print it.

    Queries the district endpoint with ``extensions=all`` and scans the
    ``polyline`` of the first returned district. Points are separated by
    ``;`` and rings by ``|``; every point of every ring is taken into account.

    :param area: area to search for, e.g. 中国 or 上海
    :return: dict with the keys ``maxlongitude``, ``minlongitude``,
        ``maxlatitude`` and ``minlatitude`` (floats), so it can be passed on
        as keyword arguments
    :raises KeyError, IndexError: when the response has no district, no
        polyline, or a malformed point
    """
    pm = {
        "key": KEY,
        "keywords": area,
        "extensions": "all"
    }
    ret: dict = requests.get(District_Url, params=pm).json()
    polyline: str = ret["districts"][0]["polyline"]

    # Start from infinities so that any real coordinate, including negative
    # ones, replaces the initial value.
    maxlatitude = float("-inf")
    minlatitude = float("inf")
    maxlongitude = float("-inf")
    minlongitude = float("inf")

    # Treat the ring separator like the point separator so that the first
    # point of every ring is parsed too.
    for point in polyline.replace("|", ";").split(";"):
        longitude_latitude = point.split(",")
        longitude = float(longitude_latitude[0])
        latitude = float(longitude_latitude[1])
        maxlatitude = max(maxlatitude, latitude)
        minlatitude = min(minlatitude, latitude)
        maxlongitude = max(maxlongitude, longitude)
        minlongitude = min(minlongitude, longitude)

    print("===========================================================================")
    print("最大纬度", maxlatitude)
    print("最小纬度", minlatitude)
    print("最大经度", maxlongitude)
    print("最小经度", minlongitude)
    print("===========================================================================")
    return {
        "maxlongitude": maxlongitude,
        "minlongitude": minlongitude,
        "maxlatitude": maxlatitude,
        "minlatitude": minlatitude,
    }
def generalID(maxlongitude, minlongitude, maxlatitude, minlatitude, latitude=0.018, longitude=0.018):
    """Cut a bounding box into fixed-size grid cells and keep those touching China.

    The box is walked column by column (west to east) and, inside each
    column, row by row (south to north). A cell is kept when at least one of
    its four corners passes ``PoiUtil.isInChina``.

    :param maxlongitude: largest longitude of the box
    :param minlongitude: smallest longitude of the box
    :param maxlatitude: largest latitude of the box
    :param minlatitude: smallest latitude of the box
    :param latitude: height of one cell, in degrees of latitude
    :param longitude: width of one cell, in degrees of longitude
    :return: list of strings ``"left_lng,up_lat|right_lng,down_lat"``
        (upper-left corner, then lower-right corner), one per kept cell
    """
    row_num = int((maxlatitude - minlatitude) / latitude) + 1
    column_num = int((maxlongitude - minlongitude) / longitude) + 1

    separator = "==========================================================================="
    print(separator)
    print("maxlatitude", maxlatitude)
    print("minlatitude", minlatitude)
    print("maxlongitude", maxlongitude)
    print("minlongitude", minlongitude)
    print("latitude", latitude, "row_num", row_num)
    print("longitude", longitude, "column_num", column_num)
    print(separator)

    polylists = []
    for col in range(column_num):
        left_longitude = minlongitude + longitude * col
        right_longitude = minlongitude + longitude * (col + 1)
        for row in range(row_num):
            down_latitude = minlatitude + latitude * row
            up_latitude = minlatitude + latitude * (row + 1)
            # Corners in the order: upper-left, upper-right, lower-left,
            # lower-right.
            corners = (
                [left_longitude, up_latitude],
                [right_longitude, up_latitude],
                [left_longitude, down_latitude],
                [right_longitude, down_latitude],
            )
            # Keep the cell as soon as one corner is inside the outline.
            if any(PoiUtil.isInChina(corner) for corner in corners):
                polylists.append(f"{left_longitude},{up_latitude}|{right_longitude},{down_latitude}")
    print("经纬度块个数", len(polylists))
    return polylists
def getRoad(polygon_pair, page, types, save_file) -> int:
    """Fetch one result page of POIs inside a rectangle and append it to a file.

    The request is repeated every 10 seconds, without an upper bound, until
    it yields a non-empty JSON document. Retrying is done in a loop rather
    than by recursion so that a long outage cannot hit the recursion limit.

    :param polygon_pair: rectangle as ``"lng,lat|lng,lat"`` (upper-left and
        lower-right corner), as produced by ``generalID``
    :param page: 1-based page number to request
    :param types: POI categories, several categories separated by ``|``
    :param save_file: open, writable text file; each POI is written as the
        ``str()`` of its dict on a line of its own
    :return: number of POIs on this page, or 0 when the response carries no
        ``pois`` field (error response)
    """
    pm = {
        "key": KEY,
        "polygon": polygon_pair,
        "types": types,
        "offset": page_size,
        "page": page,
        # Must be the string "all"; the bare name `all` is the builtin
        # function and ends up in the URL as its repr.
        "extensions": "all",
        "output": "json"
    }
    while True:
        try:
            response: dict = session.get(Road_Url, params=pm, headers={'Connection': 'close'},).json()
        except Exception as r:
            print(f'\033[1;31;0m 请求异常,稍后重试, 异常信息[{str(r)}] --页数[{page}] --块{[os.path.basename(save_file.name)]} \033[0m')
            time.sleep(10)
            continue
        if response:
            break
        # Empty document: treat as a failed request and try again.
        print("没有请求成功再次调用")
        time.sleep(10)

    # Error responses look like
    # {'status': '0', 'info': 'ENGINE_RESPONSE_DATA_ERROR', 'infocode': '30001'}
    poisList = response.get("pois")
    if poisList is None:
        print("错误响应", response)
        return 0

    for e in poisList:
        # Each POI is a dict with keys such as 'name', 'location', 'type',
        # 'typecode', 'pname', 'cityname', 'adname', 'address', 'id'.
        save_file.write(str(e))
        save_file.write("\n")
    save_file.flush()
    return len(poisList)
def task(saveDir, types, poylineList, start, end):
    """Crawl the grid cells ``poylineList[start:end]`` into one result file.

    Results go to ``<saveDir>/<start>-<end>.txt`` and progress is logged to
    ``<saveDir>/log/<start>-<end>.log`` (the ``log`` directory must already
    exist). If the log file exists, the cell index in square brackets on its
    last line is taken as the last finished cell and crawling resumes with
    the next one. Any exception is printed and swallowed so that one failing
    range does not abort the others; both files are closed in every case.

    :param saveDir: directory for result and log files
    :param types: POI categories passed on to ``getRoad``
    :param poylineList: all grid cells, as produced by ``generalID``
    :param start: index of the first cell of this range (inclusive)
    :param end: index after the last cell of this range (exclusive)
    :return: tuple ``(request_count, poi_count)`` for this run
    """
    request_count = 0
    poi_count = 0
    try:
        # File names always use the original range, also when resuming.
        savePath = saveDir + f"/{start}-{end}.txt"
        logPath = saveDir + f"/log/{start}-{end}.log"
        log = f"[{start}-{end}]"

        # Resume from the log if there is one.
        last_start = None
        if os.path.exists(logPath):
            with open(logPath, 'r') as f:
                lines = f.readlines()
            if lines:
                obj = re.search(r"\[(\d+)\]", lines[-1])
                if obj:
                    last_start = int(obj.group(1))
                    start = last_start + 1

        start_time = time.time()
        with open(savePath, 'a+') as save_file, open(logPath, 'a+') as log_file:
            for i in range(start, end):
                # Page through the cell; a short page means it was the last.
                for j in range(1, 100):
                    request_count += 1
                    count = getRoad(polygon_pair=poylineList[i], page=j, types=types, save_file=save_file)
                    poi_count += count
                    if count < page_size:
                        break
                # Record the finished cell so a restart can skip it.
                log_file.write(f"处理: [{i}] 块" + "请求数量: {}".format(request_count))
                log_file.write("\n")
                log_file.flush()
        end_time = time.time()
        print("执行耗时: ", (end_time - start_time), "请求数量: ", request_count, "处理范围", log, "poi数量", poi_count, "上次执行",
              last_start)
    except Exception:
        traceback.print_exc()
    return request_count, poi_count
def catchByAsync(saveDir, types, maxlongitude, minlongitude, maxlatitude, minlatitude, task_count):
    """Crawl all POIs of the given categories inside a rectangle, concurrently.

    Steps: split the rectangle into grid cells, split the cells into
    consecutive index ranges, crawl each range in its own thread via
    ``task``, then merge the per-range files. The merged data is appended to
    the largest per-range file, the other per-range files are deleted, and
    the result is copied to ``<saveDir>/<types>.txt``.

    :param saveDir: output directory (created if missing, with a ``log``
        sub-directory)
    :param types: POI categories to crawl, several separated by ``|``
    :param maxlongitude: largest longitude
    :param minlongitude: smallest longitude
    :param maxlatitude: largest latitude
    :param minlatitude: smallest latitude
    :param task_count: desired number of ranges; each range holds
        ``len(cells) // task_count`` cells (at least one), and a final range
        takes the remainder
    :return: None
    """
    import shutil  # function-scope import: only the merge step needs it

    start_time = time.time()
    # 1. Grid partitioning.
    poylineList = generalID(maxlatitude=maxlatitude,
                            minlatitude=minlatitude,
                            maxlongitude=maxlongitude,
                            minlongitude=minlongitude)
    # Make sure the output directory and its log sub-directory exist.
    os.makedirs(saveDir + "/log", exist_ok=True)

    # 2. Split into sub-tasks - every thread owns its own cell range.
    block_total = len(poylineList)
    exe_count = int(block_total / task_count)
    if exe_count < 1 and block_total > 0:
        # Fewer cells than tasks: a step of 0 would never reach the end of
        # the list, so fall back to one cell per range.
        exe_count = 1
    rangeList = []
    i = 0
    j = exe_count
    rangeList.append([i, j])
    while True:
        i = j
        j += exe_count
        if j >= block_total:
            rangeList.append([i, block_total])
            break
        rangeList.append([i, j])

    # 3. Run concurrently; the pool is shut down when the block is left.
    count = 0
    pcount = 0
    with cft.ThreadPoolExecutor(max_workers=100) as pool:
        task_list = [pool.submit(task, saveDir, types, poylineList, e[0], e[1]) for e in rangeList]
        for future in cft.as_completed(task_list):  # blocks until every task is done
            result = future.result()
            count += result[0]
            pcount += result[1]
    print("总耗时: ", (time.time() - start_time), "总请求数量: ", count, "总poi数量", pcount)

    # 4. Merge the sub-task results.
    print("开始汇总子任务结果")
    start_time = time.time()
    # Only this run's per-range files take part in the merge, so unrelated
    # files in the directory (e.g. an earlier merged output) are never picked
    # as the merge target.
    part_names = []
    for e in rangeList:
        name = f"{e[0]}-{e[1]}.txt"
        if name not in part_names and os.path.isfile(os.path.join(saveDir, name)):
            part_names.append(name)

    if part_names:
        # Start from the largest file so that the least data has to be copied.
        max_file_name = max(part_names, key=lambda n: os.path.getsize(os.path.join(saveDir, n)))
        total_fileName = os.path.join(saveDir, max_file_name)
        with open(total_fileName, 'a+') as total_file:
            for name in part_names:
                if name == max_file_name:
                    continue
                part_path = os.path.join(saveDir, name)
                with open(part_path, 'r') as tmp_file:
                    shutil.copyfileobj(tmp_file, total_file)  # lines keep their newline
                # Plain file API instead of a shell command: the path may
                # contain "|" or spaces, which a shell would misinterpret.
                os.remove(part_path)
        merged_path = os.path.dirname(total_fileName) + '/' + types + '.txt'
        if os.path.abspath(merged_path) != os.path.abspath(total_fileName):
            shutil.copyfile(total_fileName, merged_path)
    print("汇总耗时: ", (time.time() - start_time))
3、测试
3.1、先获取某个地区的大致范围
if __name__ == '__main__':
    # Print the approximate latitude/longitude range of Shenzhen.
    GaodeUtil.findDistinct(area="深圳市")
结果: 深圳市的大致经纬度范围如下
最大纬度 22.861748
最小纬度 22.396344
最大经度 114.628466
最小经度 113.751453
3.2、爬取该范围内的所有POI数据
由于该范围过大,如果通过接口搜索POI一般能见度很低搜索不到多少POI地址, 所以会进行网格划分进行更细粒度的爬取
if __name__ == '__main__':
    # Names chosen so that the builtins `type` and `dir` are not shadowed.
    # Single-category alternative: poi_types = "商务住宅"
    poi_types = "汽车服务|汽车销售|汽车维修|摩托车服务|餐饮服务|购物服务|生活服务|体育休闲服务|医疗保健服务|住宿服务|风景名胜|商务住宅|政府机构及社会团体|科教文化服务|交通设施服务|金融保险服务|公司企业|道路附属设施|地名地址信息|公共设施|室内设施|通行设施"
    save_root = "/Users/burukeyou/Documents/code/python/demo/脚本/B6_爬取高德POI/结果"
    # Shenzhen, using the range printed by findDistinct.
    GaodeUtil.catchByAsync(saveDir=save_root + f"/{poi_types}",
                           types=poi_types,
                           maxlongitude=114.628466,
                           minlongitude=113.751453,
                           maxlatitude=22.861748,
                           minlatitude=22.396344,
                           task_count=10)
结果目录介绍:
- 会将每个范围的POI结果地址单独放置,最后会汇总放到一个文件。 log目录是该范围的爬取日志,支持异常重爬, 如果存在log目录,每次会从该日志文件的最新位置开始爬取防止重复爬取。