Python爬取论文信息保存到MYSQL数据库,论文素材有了

240 阅读2分钟

1.设计要爬取的字段,设计数据库表

2.进行爬取,在控制台可以看到爬取的结果

3.将爬取的结果保存到数据库中

4.查看数据库

这里只爬取NIPS上论文的题目、作者、摘要做测试,后续将涉及所有字段的爬取。

代码

craw_db.py:

import pymysql 
import requests
from bs4 import BeautifulSoup

def parse_html(url):
    """Fetch one NIPS paper page, extract title / authors / abstract,
    and insert one row into the ``huiyi`` MySQL table.

    Args:
        url: absolute URL of a single paper's detail page.

    Side effects:
        Performs an HTTP GET and one INSERT; prints the inserted row count.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: guard against a missing <h2 class="subtitle"> instead of
    # crashing on None.text.
    title_tag = soup.find('h2', {'class': "subtitle"})
    title = title_tag.text if title_tag else ""

    # Authors: each <li class="author"> anchor text, ';'-terminated
    # (same "a;b;c;" format the original loop produced).
    authors = ''.join(
        li.find('a').text + ';'
        for li in soup.find_all('li', {'class': "author"})
    )

    # Fields below are placeholders until their scraping is implemented.
    author_company = "qq"       # first-author affiliation (TODO: scrape)
    keywords = "keywordd"       # keywords (TODO: scrape)

    # Abstract: fall back to a placeholder when the tag is absent.
    abstract_tag = soup.find('h3', {'class': "abstract"})
    if abstract_tag:
        abstrcts = abstract_tag.text.strip()
    else:
        abstrcts = "abstractt"

    confer_name = "NIPS"        # conference name (fixed for this crawler)
    publish_date = "dateee"     # conference date (TODO: scrape)
    github_code = "github_code" # code link (TODO: scrape)
    paper_url = url
    con_paper = "someooo"

    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='自己的数据库密码',
        db='craw_con',
        charset='utf8')
    try:
        # Get a cursor.
        cursor = connect.cursor()

        # Parameterized INSERT: the driver escapes the values, so titles or
        # abstracts containing quotes no longer break the statement (the
        # original "sql % data" interpolation was also an injection vector).
        sql = ("INSERT INTO huiyi(title,keywords,authors,author_company,"
               "confer_name,abstrcts,publish_date,github_code,paper_url,"
               "con_paper) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        data = (title, keywords, authors, author_company, confer_name,
                abstrcts, publish_date, github_code, paper_url, con_paper)
        cursor.execute(sql, data)
        connect.commit()
        print('成功插入', cursor.rowcount, '条数据')
        cursor.close()
    finally:
        # Close the connection even if the fetch/insert raised.
        connect.close()

def main(url):
    """Entry point for a single paper URL.

    Delegates to parse_html(), which downloads the page, extracts the
    fields and writes them to the database.
    """
    parse_html(url)

craw_todb.py:

import re
import time
from bs4 import BeautifulSoup
import requests
from requests import RequestException
import craw_db
from lxml import etree
def get_page(url):
    """Download *url* pretending to be a desktop browser.

    Returns the decoded page text on HTTP 200, or None on any request
    error or non-200 status.
    """
    # A browser User-Agent so the site does not reject the crawler.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
    except RequestException as e:
        print(e)
        return None
    if response.status_code != 200:
        return None
    # Let requests guess the real encoding before decoding.
    response.encoding = response.apparent_encoding
    return response.text

def get_url(html):
    """Extract every paper's relative href from one search-results page.

    Args:
        html: the page source returned by get_page() for the results page.

    Returns:
        list of href strings (relative URLs, may contain None for <a>
        tags without an href attribute).

    Bug fixed: the original parsed the module-level global ``htm1``
    (the first page fetched in ``__main__``) instead of the ``html``
    parameter, so every call returned the first page's links.
    """
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('div', {'class': "main wrapper clearfix"})
    return [li.find('a').attrs.get('href')
            for li in container.find_all("li")]

def get_info(url):
    """Crawl one paper's detail page and persist it via the craw_db module."""
    craw_db.main(url)
if __name__ == '__main__':
        # Browser User-Agent used for the initial search-page request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        key_word = input('请输入搜索关键词:')  # search keyword; interactive, could also be hard-coded
        # Which results page to start crawling from.
        start_page = int(input('请输入爬取的起始页:'))
        base_url = 'https://papers.nips.cc/search/?q={}&page={}'
        first_url=base_url.format(key_word,start_page)
        # NOTE(review): `htm1` is read as a global inside get_url(), so every
        # page ends up parsing this first response — renaming or removing it
        # here would break that function; fix both together.
        htm1 = requests.get(first_url, headers=headers)
        soup = BeautifulSoup(htm1.text, 'html.parser')

        # Total page count. The scraped value is commented out; hard-coded to 5
        # for testing, so range() below crawls pages start_page..4.
        #pagesum = soup.find('span', class_='searchPageWrap_all').get_text()
        pagesum=5
        for page in range(int(start_page), int(pagesum)):

            new_url = base_url.format(key_word,page)
            # Fetch the current results page (request + response).
            html = get_page(new_url)
            # Parse it and collect every paper's relative URL on that page.
            url_list = get_url(html)
            for url in url_list:
                # Crawl each paper's detail page and store its fields.
                urll="https://papers.nips.cc"+url
                print("url:",urll)
                get_info(urll)
                time.sleep(2)  # wait 2 s between requests to be polite

结果

![](https://p9-tt-ipv6.byteimg.com/origin/pgc-image/ecf76f0396254a19957faa84f95bf0e8)
![](https://p1-tt-ipv6.byteimg.com/origin/pgc-image/039aadf5aea843959477b8731547ffdf)

剩下的你知道该怎么做了吧,这个不需要教你吧!

PS:如遇到解决不了问题的小伙伴可以加点击下方链接自行获取

python免费学习资料以及群交流解答点击即可加入