pyquery+mysql抓取赶集网实战

156 阅读2分钟
原文链接: zhuanlan.zhihu.com

本文抓取赶集网小狗数据存储到MySQL

关于pyquery的使用可以参考这篇文章pyquery全面总结

在安装配置好MySQL的情况下,创建名为ganji的数据库,在ganji中创建名为dog的表,然后运行下面代码(里面要填写读者安装mysql时创建的密码)

import requests # 导入请求库
from pyquery import PyQuery as pq # 导入解析库
import pymysql # 用于连接并操作MySQL数据库

def start_requests(url):
    r = requests.get(url)
    return r.content

def parse(text):
    doc = pq(text)
    for li in doc('.petItem'):
        doci = pq(li)
        counts = doci('span.counts').text().split(' ')
        yield {
        'title': doci('.itemTitle').text().replace('\xa0', ''),
        'price': doci('.price').text().split(' ')[0],
        'store': doci('.storeName').text().strip(),
        'score': counts[0],
        'sold': counts[1],
        'url': doci.attr('href')
        }

def main():
    url = 'http://www.xinchong.com/xm/idog/?source=p_ganji_dog_list_1&fcity=xm&gj_other_ca_kw=-&gj_other_ca_s=-&gj_other_sid=&gj_other_gjuser=-&gj_other_ifid=from_ganji&gj_other_gc_1=chongwu&gj_other_city_id=12&gj_other_ca_i=-&gj_other_ca_n=-&gj_other_uuid=-'
    text = start_requests(url)
    connection = pymysql.connect(host='localhost', # 连接数据库
                                 user='root',
                                 password='', # 你安装mysql时设置的密码
                                 db='ganji',
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    sql = "insert into dog(title, price, store, score, sold, url)values(%s,%s,%s,%s,%s,%s)"
    try:
        with connection.cursor() as cursor:
            for item in parse(text):
                cursor.execute(sql, (
                    item['title'], item['price'], item['store'], item['score'], item['sold'], item['url']))
        connection.commit() # 提交刚刚执行的SQL处理,使其生效
    finally:
        connection.close()

if __name__ == '__main__':
    main()

抓取结果如下

如果想要不用事先在数据库中创建再用python连接,而是直接在python中创建数据库和表,可以这样

import requests # 导入请求库
from pyquery import PyQuery as pq # 导入解析库
import pymysql # 用于连接并操作MySQL数据库
def start_requests(url):
    r = requests.get(url)
    return r.content
def parse(text):
    doc = pq(text)
    for li in doc('.petItem'):
        doci = pq(li)
        counts = doci('span.counts').text().split(' ')
        yield {
        'title': doci('.itemTitle').text().replace('\xa0', ''),
        'price': doci('.price').text().split(' ')[0],
        'store': doci('.storeName').text().strip(),
        'score': counts[0],
        'sold': counts[1],
        'url': doci.attr('href')
        }
def main():
    url = 'http://www.xinchong.com/xm/idog/?source=p_ganji_dog_list_1&fcity=xm&gj_other_ca_kw=-&gj_other_ca_s=-&gj_other_sid=&gj_other_gjuser=-&gj_other_ifid=from_ganji&gj_other_gc_1=chongwu&gj_other_city_id=12&gj_other_ca_i=-&gj_other_ca_n=-&gj_other_uuid=-'
    text = start_requests(url)
    connection = pymysql.connect(host='localhost', # 少了指定数据库
                                 user='root',
                                 password='', # 你安装mysql时设置的密码
                                 charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    # 加两条SQL执行语句,创建数据库和表
    # 注意这时每次调用表都要前面先指定数据库(因为connection没有指定哪个数据库了)
    create_db = 'CREATE DATABASE IF NOT EXISTS ganji2;'
    create_table = '''create table ganji2.dog 
    (
    title char(20),
    price char(20),
    store char(20),
    score char(20),
    sold char(20),
    url char(100)
    );
    '''
    # 其他的都没有变
    sql = 'insert into ganji2.dog(title, price, store, score, sold, url)values(%s,%s,%s,%s,%s,%s)'
    try:
        with connection.cursor() as cursor:
            cursor.execute(create_db)
            cursor.execute(create_table)
            for item in parse(text):
                cursor.execute(sql, (
                    item['title'], item['price'], item['store'], item['score'], item['sold'], item['url']))
        connection.commit()
    finally:
        connection.close()
if __name__ == '__main__':
    main()

上面代码的基础已经在前面文章中讲过,有问题可以参考下面文章

专栏信息

专栏主页:python编程

专栏目录:目录

爬虫目录:爬虫系列目录

版本说明:软件及包版本说明