京东商城爬虫
import csv
import os
import time

import redis
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


#输入商品名称
word = input('请输入你想要获取的商品名称:')


#写入标头
header = ['标题','价格','评论','店铺','详情页']
with open(f'jd/{word}.csv', mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    
    
#selenium启动无头浏览器,自动搜索京东搜索网页,自动填入关键字后自动搜索
chrome_options = Options()
driver = webdriver.Chrome("C:\Program Files\Google\Chrome\Application\chromedriver.exe")
driver.get("https://www.jd.com/")
driver.find_element(By.XPATH, '//*[@id="key"]').send_keys(word)
driver.find_element(By.XPATH, '//*[@id="key"]').send_keys(Keys.ENTER)


#自动滑动到最下面
def drop_down():
    for x in range(1,12,2):
        time.sleep(3)
        j = x / 9
        js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight = %f' % j
        driver.execute_script(js)
        
        
#获取商品数据并写入csv文件里
def get_shop_info():
    driver.implicitly_wait(10)
    drop_down()
    lis = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul//li')
    for li in lis:
        title = li.find_element(By.XPATH, './/a[@target="_blank"]/em').text.replace('\n','')
        price = li.find_element(By.XPATH, './div/div[2]/strong/i').text
        #评论条数
        comment = li.find_element(By.XPATH, './/*[contains(@id,"J_comment_")]').text
        
        
        try:
            #店铺名称
            shop_name = li.find_element(By.XPATH, './div/div[5]/span/a[contains(@target,"_blank")]').text
        except NoSuchElementException:
            shop_name = "暂无店铺"
            
            
        #详情页URL地址
        href = li.find_element(By.XPATH, './div/div[3]/a').get_attribute("href")
        print(title, price, comment, shop_name, href)
        
        
        lst = [title, price, comment, shop_name, href]
        with open(f'jd/{word}.csv','a',newline='',encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(lst)
            
            
# redis-server启动Redis服务器
# redis-cli.exe -h 127.0.0.1 -p 6379连接到 Redis 服务器的命令行工具
#连接Redisinsight在第4个数据库DB4中写入数据
def input_redis():
    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=4)
    with open(f'jd/{word}.csv', 'rt', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            title = row['标题']
            price = row['价格']
            comment = row['评论']
            shop_name = row['店铺']
            href = row['详情页']
            print("==================================")
            print(row)
            r.hmset(title,{
                '标题': title,'价格': price,
                '评论': comment,'店铺': shop_name,
                '详情页': href})
                
                
#依托selenium自动翻页
for page in range(1,100):
    print(f'====================正在采集第{page}页的数据内容====================')
    get_shop_info()
    time.sleep(5)
    driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').send_keys(Keys.ARROW_RIGHT)
    input_redis()
    time.sleep(10)
#退出Chromedriver
driver.quit()
以下对上面代码的关键部分逐段说明:


1、word = input('请输入你想要获取的商品名称:')


2、chrome_options = Options() 
driver = webdriver.Chrome("C:\Program Files\Google\Chrome\Application\chromedriver.exe")


3、driver.get("https://www.jd.com/")


4、driver.find_element(By.XPATH, '//*[@id="key"]').send_keys(word)
driver.find_element(By.XPATH, '//*[@id="key"]').send_keys(Keys.ENTER)


5、#获取商品数据并写入csv文件里
def get_shop_info():
    driver.implicitly_wait(10)
    drop_down()
    lis = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul//li')
    for li in lis:
        title = li.find_element(By.XPATH, './/a[@target="_blank"]/em').text.replace('\n','')


6、try:
            #店铺名称
            shop_name = li.find_element(By.XPATH, './div/div[5]/span/a[contains(@target,"_blank")]').text
        except NoSuchElementException:
            shop_name = "暂无店铺"
            
            
7、lst = [title, price, comment, shop_name, href]
with open(f'jd/{word}.csv','a',newline='',encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(lst)
    
    
8、# redis-server启动Redis服务器
# redis-cli.exe -h 127.0.0.1 -p 6379 连接到 Redis 服务器的命令行工具
#连接Redisinsight在第4个数据库DB4中写入数据
def input_redis():
    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=4)
    with open(f'jd/{word}.csv', 'rt', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            title = row['标题']
            price = row['价格']
            comment = row['评论']
            shop_name = row['店铺']
            href = row['详情页']
            print("==================================")
            print(row)
            r.hmset(title, {'标题': title, '价格': price, '评论': comment, '店铺': shop_name, '详情页': href})


9、#写入标头
header = ['标题','价格','评论','店铺','详情页']
with open(f'jd/{word}.csv', mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)


10、#自动滑动到最下面
def drop_down():
    for x in range(1,12,2):
        time.sleep(3)
        j = x / 9
        js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight = %f' % j
        driver.execute_script(js)


11、#依托selenium自动翻页
for page in range(1,100):
    print(f'====================正在采集第{page}页的数据内容====================')
    get_shop_info()
    time.sleep(5)
    driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').send_keys(Keys.ARROW_RIGHT)
    input_redis()
    time.sleep(10)
#退出Chromedriver
driver.quit()