Grabbing every hyperlink on a site automatically with Python (for learning purposes only)
Code:

#!/usr/bin/python
import requests
import time
import re
import sys, getopt  # command-line option parsing
from bs4 import BeautifulSoup

localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # timestamp
z = []  # raw hrefs scraped from the page
x = []  # cleaned-up URLs

def main(argv):
    url = ''        # target URL
    file_path = ''  # output file path
    try:
        opts, args = getopt.getopt(argv, "hu:f:", ["url=", "file="])
    except getopt.GetoptError:
        print('allsite.py -u <url> -f <file>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':  # help
            print('allsite.py -u <url> -f <file>')
            sys.exit()
        elif opt in ("-u", "--url"):  # target URL
            url = arg
            re1 = requests.get(url)  # fetch the page
            re1.encoding = "utf-8"
            html = re1.text
            bt = BeautifulSoup(html, 'html.parser')
            hh = bt.find_all('a')  # find all <a> elements
            for site in hh:
                if site.get('href') is not None:  # skip <a> tags without an href
                    z.append(site.get('href'))
            for i in z:  # normalize each href into a full URL
                if re.match('//www', str(i)) or re.match('www', str(i)):
                    xx = str(i).replace('//www', 'www', 1)
                    x.append(xx)
                elif re.match('http', str(i)):  # already absolute
                    x.append(str(i))
                elif re.match('/', str(i)):  # root-relative path
                    xx = str(i).replace("/", "", 1)
                    if re.match('/', xx):  # second slash: protocol-relative (//host/...)
                        xxx = str(xx).replace("/", "", 1)
                        x.append(xxx)
                    else:  # note: assumes url ends with '/'
                        x.append(url + xx)
                else:  # relative path; skip javascript: pseudo-links
                    if re.search('javascript', str(i)) is None:
                        x.append(url + str(i))
            print(localtime + " total: " + str(len(x)) + " URLs")
            for i in x:  # print the hyperlinks
                print(i)
        elif opt in ("-f", "--file"):  # output file path
            file_path = arg
            with open(file_path, 'a') as file_object:  # append results to the file
                for i in x:
                    file_object.write(i)
                    file_object.write('\n')

if __name__ == "__main__":
    main(sys.argv[1:])
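Running it is straightforward: -u fetches the page and prints the links, -f additionally appends them to a file. The URL below is just a placeholder; note the trailing slash, which the relative-path handling above relies on:

python allsite.py -u https://www.example.com/ -f links.txt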
|
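A caveat on the hand-rolled prefix matching: url + xx only produces a valid address when the base URL ends in '/', and protocol-relative links (//host/path) lose their scheme entirely. If all you need is correct absolute URLs, the standard library's urllib.parse.urljoin already resolves every one of these cases against the base URL. Here is a minimal sketch of the same idea, assuming the same requests/BeautifulSoup setup (collect_links is a hypothetical helper name, not part of the script above):

#!/usr/bin/python
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_links(url):
    # Fetch the page and parse every <a> element it contains.
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a'):
        href = a.get('href')
        if href is None or href.startswith('javascript'):
            continue  # skip anchors without hrefs and javascript: pseudo-links
        # urljoin resolves relative, root-relative, and
        # protocol-relative hrefs against the base URL.
        links.add(urljoin(url, href))
    return sorted(links)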