项目编号:2024-2025-BS-BD-010
一、环境介绍
语言环境:Python3.8
数据库:Mysql: mysql5.7
WEB框架:Django
开发工具:IDEA或PyCharm
二、项目简介
在经济整体下滑的情况下,旅游业成为各地的主要特色经济收入,全国各地的旅游部门也在采用各种不同的策略来提振当地旅游业。而作为普通消费者,旅游业已成为生活中重要的一个组成部分,但是众多的旅游景点让很多消费者无从选择,无法从海量的旅游数据中找到自己喜欢的旅游景点,这是目前所存在的一个主要问题。
本文提出了一种基于大数据的旅游推荐系统,该系统采用Vue作为前端开发框架,结合Python和Django作为后端,Hive作为大数据处理框架进行开发。系统设计包括多个功能模块,如旅游景点模块、门票订单模块、酒店预定模块、旅游线路模块以及分享论坛模块。通过整合各模块功能,系统能够实时分析用户行为数据,为用户提供个性化的旅游推荐服务,提高用户的旅行体验和决策效率。系统的实现展示了大数据技术在旅游行业中的应用潜力,并为旅游推荐系统的开发提供了新的思路。
编辑
三、系统展示
编辑
编辑
四、核心代码展示
# # -*- coding: utf-8 -*-
# 数据爬取文件
import scrapy
import pymysql
import pymssql
from ..items import SightinfoItem
import time
from datetime import datetime,timedelta
import datetime as formattime
import re
import random
import platform
import json
import os
import urllib
from urllib.parse import urlparse
import requests
import emoji
import numpy as np
from DrissionPage import Chromium
import pandas as pd
from sqlalchemy import create_engine
from selenium.webdriver import ChromeOptions, ActionChains
from scrapy.http import TextResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
from sqlalchemy import create_engine
from selenium.webdriver import ChromeOptions, ActionChains
from scrapy.http import TextResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# 景点信息
# Attraction-information spider (景点信息).
class SightinfoSpider(scrapy.Spider):
    """Crawl Ctrip's attraction-list JSON API and yield SightinfoItem rows.

    Behaviour:
      * In non-realtime mode on linux/windows, if a previous crawl already
        left a ``qeo23287_sightinfo`` cache table, crawling is skipped and
        cached rows are copied into ``sightinfo`` instead (see temp_data).
      * Otherwise each start URL (optionally paginated via a ``{}``
        placeholder) is fetched and parsed into items.
    """

    name = 'sightinfoSpider'
    spiderUrl = 'https://m.ctrip.com/restapi/soa2/18109/json/getAttractionList?_fxpcqlniredt=09031015316821670659&x-traceID=09031015316821670659-1739163403719-8167870'
    start_urls = spiderUrl.split(";")
    protocol = ''
    hostname = ''
    realtime = False

    def __init__(self, realtime=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Scrapy passes -a command-line arguments as strings, hence the
        # string comparison rather than a bool() cast.
        self.realtime = realtime == 'true'

    def _use_cached_data(self):
        """Load cached crawl results and return True when they exist.

        Shared by start_requests() and parse(). Fixes a resource leak in the
        original code: the connection is now closed even when the cache
        table does not exist.
        """
        plat = platform.system().lower()
        if self.realtime or plat not in ('linux', 'windows'):
            return False
        connect = self.db_connect()
        cursor = connect.cursor()
        try:
            if self.table_exists(cursor, 'qeo23287_sightinfo') == 1:
                self.temp_data()
                return True
            return False
        finally:
            cursor.close()
            connect.close()

    def start_requests(self):
        """Yield the initial requests unless cached data can be reused."""
        if self._use_cached_data():
            return
        pageNum = 1 + 1  # range(1, 2): a single page per templated URL
        for url in self.start_urls:
            if '{}' in url:
                for page in range(1, pageNum):
                    yield scrapy.Request(
                        url=url.format(page),
                        callback=self.parse
                    )
            else:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse
                )

    # --- best-effort field extraction helpers ------------------------------

    def _set_text(self, fields, key, value):
        # Text field: strip HTML tags and demojize. The field is left unset
        # when the source value is missing or malformed, matching the
        # original try/except-pass behaviour.
        if value is None:
            return
        try:
            fields[key] = emoji.demojize(self.remove_html(str(value)))
        except Exception:
            pass

    def _set_number(self, fields, key, value, cast):
        # Numeric field; left unset when absent or not castable.
        if value is None:
            return
        try:
            fields[key] = cast(value)
        except (TypeError, ValueError):
            pass

    @staticmethod
    def _joined(values):
        # Space-join a list of strings; None when missing or not joinable.
        try:
            return ' '.join(values)
        except TypeError:
            return None

    # 列表解析 — list-page parsing.
    def parse(self, response):
        """Parse one attraction-list JSON response into SightinfoItem objects."""
        _url = urlparse(self.spiderUrl)
        self.protocol = _url.scheme
        self.hostname = _url.netloc
        if self._use_cached_data():
            return
        data = json.loads(response.body)
        # A missing key simply yields no items. (The original code swallowed
        # the KeyError and then hit a NameError on the unbound loop variable,
        # which also shadowed the builtin `list`.)
        for item in data.get("attractionList", []):
            card = item.get("card", {})
            fields = SightinfoItem()
            self._set_text(fields, "poiname", card.get("poiName"))
            self._set_text(fields, "imgurl", card.get("coverImageUrl"))
            self._set_number(fields, "commentscore", card.get("commentScore"), float)
            self._set_number(fields, "commentcount", card.get("commentCount"), int)
            self._set_number(fields, "heatscore", card.get("heatScore"), float)
            self._set_number(fields, "jiage", card.get("price"), float)
            self._set_text(fields, "districtname", card.get("districtName"))
            self._set_text(fields, "zonename", card.get("zoneName"))
            self._set_text(fields, "sightcategoryinfo", card.get("sightCategoryInfo"))
            self._set_text(fields, "sightlevel", card.get("sightLevelStr"))
            self._set_text(fields, "features", self._joined(card.get("shortFeatures")))
            self._set_text(fields, "tagname", self._joined(card.get("tagNameList")))
            self._set_text(fields, "detailurl", card.get("detailUrl"))
            yield fields

    # 详情解析 — detail-page parsing (no extra fields scraped here).
    def detail_parse(self, response):
        """Return the item attached to the request meta unchanged."""
        return response.meta['fields']

    # 数据清洗 — data-cleaning demonstration.
    def pandas_filter(self):
        """Showcase duplicate/null/outlier filtering on a data sample.

        NOTE(review): the pandas calls below neither use inplace=True nor
        assign/persist their results, so this method has no lasting effect —
        it appears to be demonstration code kept for the report.
        """
        engine = create_engine('mysql+pymysql://root:123456@localhost/spiderqeo23287?charset=UTF8MB4')
        df = pd.read_sql('select * from sightinfo limit 50', con=engine)
        # Duplicate filtering (results discarded).
        df.duplicated()
        df.drop_duplicates()
        # Null filtering (results discarded).
        df.isnull()
        df.dropna()
        # Fill missing values with a placeholder (results discarded).
        df.fillna(value='暂无')
        # Outlier filtering: keep values within [100, 800].
        a = np.random.randint(0, 1000, size=200)
        cond = (a <= 800) & (a >= 100)
        a[cond]
        # 3-sigma rule on a standard normal sample (sigma == 1).
        b = np.random.randn(100000)
        cond = np.abs(b) > 3 * 1
        b[cond]
        # 3-sigma filtering on a normally distributed DataFrame.
        df2 = pd.DataFrame(data=np.random.randn(10000, 3))
        cond = (df2 > 3 * df2.std()).any(axis=1)
        # Row indices violating the 3-sigma bound, then dropped.
        index = df2[cond].index
        df2.drop(labels=index, axis=0)

    # 去除多余html标签 — strip residual HTML tags.
    def remove_html(self, html):
        """Strip HTML tags from *html* and trim whitespace; '' for None."""
        if html is None:
            return ''
        pattern = re.compile(r'<[^>]+>', re.S)
        return pattern.sub('', html).strip()

    # 数据库连接 — open a database connection from spider settings.
    def db_connect(self):
        """Return a DB connection (MySQL by default, else SQL Server)."""
        db_type = self.settings.get('TYPE', 'mysql')
        host = self.settings.get('HOST', 'localhost')
        port = int(self.settings.get('PORT', 3306))
        user = self.settings.get('USER', 'root')
        password = self.settings.get('PASSWORD', '123456')
        # databaseName may be set on the spider; fall back to settings.
        database = getattr(self, 'databaseName',
                           self.settings.get('DATABASE', ''))
        if db_type == 'mysql':
            connect = pymysql.connect(host=host, port=port, db=database,
                                      user=user, passwd=password,
                                      charset='utf8')
        else:
            connect = pymssql.connect(host=host, user=user,
                                      password=password, database=database)
        return connect

    # 判断表是否存在 — check whether a table exists.
    def table_exists(self, cursor, table_name):
        """Return 1 when *table_name* exists in the current database, else 0.

        Fixes a syntax error in the original regex-based implementation by
        reading table names directly from the SHOW TABLES result rows.
        """
        cursor.execute("show tables;")
        table_list = [row[0] for row in cursor.fetchall()]
        return 1 if table_name in table_list else 0

    # 数据缓存源 — copy cached crawl rows into the serving table.
    def temp_data(self):
        """Insert up to 50 random cached rows not yet present in `sightinfo`."""
        connect = self.db_connect()
        cursor = connect.cursor()
        sql = '''
            insert into `sightinfo`(
                id
                ,poiname
                ,imgurl
                ,commentscore
                ,commentcount
                ,heatscore
                ,jiage
                ,districtname
                ,zonename
                ,sightcategoryinfo
                ,sightlevel
                ,features
                ,tagname
                ,detailurl
            )
            select
                id
                ,poiname
                ,imgurl
                ,commentscore
                ,commentcount
                ,heatscore
                ,jiage
                ,districtname
                ,zonename
                ,sightcategoryinfo
                ,sightlevel
                ,features
                ,tagname
                ,detailurl
            from `qeo23287_sightinfo`
            where(not exists (select
                id
                ,poiname
                ,imgurl
                ,commentscore
                ,commentcount
                ,heatscore
                ,jiage
                ,districtname
                ,zonename
                ,sightcategoryinfo
                ,sightlevel
                ,features
                ,tagname
                ,detailurl
            from `sightinfo` where
                `sightinfo`.id=`qeo23287_sightinfo`.id
            ))
            order by rand()
            limit 50;
        '''
        # Close the connection even when execute/commit fails (the original
        # leaked it on error).
        try:
            cursor.execute(sql)
            connect.commit()
        finally:
            connect.close()