PySpark


Install Anaconda3

# Installation directory
/usr/local/src/anaconda3

# Create a virtual environment (here, a Python 3.6 environment)
# Environments live in /usr/local/src/anaconda3/envs
conda create -n dev --copy -y -q python=3.6

# List all virtual environments
conda env list

# Delete a virtual environment
conda remove -n dev --all

# Activate a virtual environment
source activate <name>   # or: conda activate <name>
source activate dev
# Upgrade pip
python -m pip install --upgrade pip

# With the dev environment activated, install the required packages
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple jieba

# Deactivate the virtual environment
source deactivate   # or: conda deactivate

# Package the environment for PySpark
# (run from /usr/local/src/anaconda3/envs so that dev/ is at the top level of the archive)
zip -r dev.zip dev

Install Jupyter Notebook

# Configure the password
jupyter notebook --generate-config
vim ~/.jupyter/jupyter_notebook_config.py

	c.NotebookApp.ip = '*'
	c.NotebookApp.password = u'sha1:a4c460523ba6:f73b711d256801d2012efb008f8879b694282669'
	c.NotebookApp.open_browser = False
	c.NotebookApp.port = 7777
# Start the notebook server
jupyter notebook --allow-root 1>/dev/null 2>&1 &

netstat -tnlp | grep 7777
kill -9 pid

# Generate the password hash in an ipython shell (this is where the sha1 value above comes from)
from notebook.auth import passwd
passwd()
Password: hadoop
sha1:a4c460523ba6:f73b711d256801d2012efb008f8879b694282669

Shell script to submit the PySpark job

export PYSPARK_PYTHON=./ANACONDA/dev/bin/python
cd $SPARK_HOME
./bin/spark-submit \
    --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./ANACONDA/dev/bin/python \
    --master yarn \
    --deploy-mode cluster \
    --files $HIVE_HOME/conf/hive-site.xml \
    --archives /usr/local/src/anaconda3/envs/dev.zip#ANACONDA \
    /usr/local/src/sub/pyspark/JB.py
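
To confirm that the executors really run the interpreter shipped in dev.zip (and can therefore import jieba), a quick check like the one below can be dropped into the submitted script. This is only a sketch, not part of the original JB.py; it just prints the interpreter path reported by the driver and by the executor Python processes.

# Sketch: verify which Python interpreter the executors use (not part of the original JB.py)
import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("EnvCheck").getOrCreate()
sc = spark.sparkContext

# Each task reports the sys.executable of its worker Python process
paths = sc.parallelize(range(sc.defaultParallelism)).map(lambda _: sys.executable).distinct().collect()
print("executor python:", paths)
print("driver python:  ", sys.executable)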

PySpark jobs

# JB.py: jieba word segmentation
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import jieba

spark = SparkSession.builder.appName("JB Test").enableHiveSupport().getOrCreate()
df = spark.sql("select sentence,label from elaiza.news_noseg limit 10")
df.show()

def seg(text):
    return ' '.join(jieba.cut(text,cut_all=True))

seg_udf = udf(seg,StringType())

df1 = df.withColumn('seg', seg_udf(df.sentence)).select('seg','label')
df1.show()
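
For reference, the effect of full mode (cut_all=True) versus precise mode can be checked locally without Spark. The sentence below is just the standard jieba example, and the exact output depends on the jieba version and dictionary.

# Sketch: jieba segmentation modes, run locally (example sentence only)
import jieba

text = "我来到北京清华大学"
print("full mode:   ", ' '.join(jieba.cut(text, cut_all=True)))   # e.g. 我 来到 北京 清华 清华大学 华大 大学
print("precise mode:", ' '.join(jieba.cut(text, cut_all=False)))  # e.g. 我 来到 北京 清华大学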


# Model code: Naive Bayes text classification on TF-IDF features
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import HashingTF,StringIndexer,Tokenizer,IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import jieba

# Create a SparkSession with Hive support
spark = SparkSession.builder.appName("NB").enableHiveSupport().getOrCreate()

# Read the data from Hive
df = spark.sql("select sentence,label from elaiza.news_noseg")
df.show()

# Define the jieba segmentation function
def seg(text):
    return ' '.join(jieba.cut(text,cut_all=True))
seg_udf = udf(seg, StringType())

# Segment the text with jieba
df_seg = df.withColumn('seg',seg_udf(df.sentence)).select('seg','label')
df_seg.show()

# Turn the segmented string into an array of words (ArrayType)
tokenizer = Tokenizer(inputCol='seg',outputCol='words')
df_seg_arr=tokenizer.transform(df_seg).select('words','label')
df_seg_arr.show()

# Build text features from the segmented words: term frequency, then IDF
tf = HashingTF(numFeatures=1<<18,binary=False,inputCol='words',outputCol='rawfeatures')
df_tf = tf.transform(df_seg_arr).select('rawfeatures','label')
df_tf.show()

idf = IDF(inputCol='rawfeatures',outputCol='features')
idfModel = idf.fit(df_tf)
df_tfidf = idfModel.transform(df_tf)
df_tfidf.show()

# Index the string labels as numeric values
stringIndexer = StringIndexer(inputCol='label',outputCol="indexed", handleInvalid='error')
indexer = stringIndexer.fit(df_tfidf)
df_tfidf_lab = indexer.transform(df_tfidf).select('features','indexed')
df_tfidf_lab.show()

# Split into training and test sets
splits = df_tfidf_lab.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Define the model
nb = NaiveBayes(featuresCol="features", labelCol="indexed", predictionCol="prediction",
                probabilityCol="probability", rawPredictionCol="rawPrediction",
                smoothing=1.0,
                modelType="multinomial")
# Train the model
model = nb.fit(train)
# Predict on the test set
predictions = model.transform(test)
predictions.show()

# Compute the accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="indexed",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))