安装anaconda3
/usr/local/src/anaconda3
conda create -n dev --copy -y -q python=3.6
conda env list
conda remove -n dev --all
source activate name (或者 conda activate name)
source activate dev
python -m pip install --upgrade pip
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple jieba
source deactivate (或者 conda deactivate)
zip -r dev.zip dev
安装 jupyter notebook
jupyter notebook --generate-config
vim ~/.jupyter/jupyter_notebook_config.py
c.NotebookApp.ip='*'
c.NotebookApp.password = u'sha1:a4c460523ba6:f73b711d256801d2012efb008f8879b694282669'
c.NotebookApp.open_browser = False
c.NotebookApp.port = 7777
jupyter notebook --allow-root 1>/dev/null 2>&1 &
netstat -tnlp | grep 7777
kill -9 pid
from notebook.auth import passwd
passwd()
密码:hadoop
sha1:a4c460523ba6:f73b711d256801d2012efb008f8879b694282669
执行pyspark的shell脚本
# Submit the PySpark job to YARN in cluster mode, shipping the packed conda env.
# The archive is unpacked in each container under the alias "ANACONDA", so the
# interpreter path below is relative to the container's working directory.
export PYSPARK_PYTHON=./ANACONDA/dev/bin/python
cd $SPARK_HOME
# "--master yarn-cluster" is deprecated; the equivalent modern form is
# "--master yarn --deploy-mode cluster".
./bin/spark-submit \
  --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./ANACONDA/dev/bin/python \
  --master yarn \
  --deploy-mode cluster \
  --files $HIVE_HOME/conf/hive-site.xml \
  --archives /usr/local/src/anaconda3/envs/dev.zip#ANACONDA \
  /usr/local/src/sub/pyspark/JB.py
pyspark任务
"""Smoke-test job: read 10 rows from a Hive table and segment the Chinese
sentences with jieba via a Spark SQL UDF."""
from pyspark.sql import SparkSession
# Explicit imports instead of "from ... import *", which shadows builtins
# (e.g. sum, max) and hides where udf/StringType come from.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import jieba

spark = SparkSession.builder.appName("JB Test").enableHiveSupport().getOrCreate()

df = spark.sql("select sentence,label from elaiza.news_noseg limit 10")
df.show()

def seg(text):
    """Segment one sentence into space-separated tokens (jieba full mode)."""
    return ' '.join(jieba.cut(text, cut_all=True))

seg_udf = udf(seg, StringType())
df1 = df.withColumn('seg', seg_udf(df.sentence)).select('seg', 'label')
df1.show()

spark.stop()  # release YARN resources when the job finishes
"""Naive Bayes text classification on the Hive news table.

Pipeline: jieba segmentation (UDF) -> Tokenizer -> HashingTF -> IDF
-> StringIndexer (label) -> NaiveBayes, evaluated by multiclass accuracy.
"""
from pyspark.sql import SparkSession
# Explicit imports instead of "from ... import *", which shadows builtins
# and hides where udf/StringType come from.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import HashingTF, StringIndexer, Tokenizer, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import jieba

spark = SparkSession.builder.appName("NB").enableHiveSupport().getOrCreate()

df = spark.sql("select sentence,label from elaiza.news_noseg")
df.show()

def seg(text):
    """Segment one sentence into space-separated tokens (jieba full mode)."""
    return ' '.join(jieba.cut(text, cut_all=True))

seg_udf = udf(seg, StringType())
df_seg = df.withColumn('seg', seg_udf(df.sentence)).select('seg', 'label')
df_seg.show()

# Whitespace tokenizer: splits the jieba output back into an array of words.
tokenizer = Tokenizer(inputCol='seg', outputCol='words')
df_seg_arr = tokenizer.transform(df_seg).select('words', 'label')
df_seg_arr.show()

# Term frequencies via the hashing trick (2^18 buckets, raw counts).
tf = HashingTF(numFeatures=1 << 18, binary=False, inputCol='words', outputCol='rawfeatures')
df_tf = tf.transform(df_seg_arr).select('rawfeatures', 'label')
df_tf.show()

# NOTE(review): IDF is fitted on the FULL dataset before the train/test split,
# which leaks test-set statistics into the features — consider splitting first.
idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(df_tf)
df_tfidf = idfModel.transform(df_tf)
df_tfidf.show()

# Map string labels to the numeric indices the classifier requires.
stringIndexer = StringIndexer(inputCol='label', outputCol="indexed", handleInvalid='error')
indexer = stringIndexer.fit(df_tfidf)
df_tfidf_lab = indexer.transform(df_tfidf).select('features', 'indexed')
df_tfidf_lab.show()

# 70/30 split with a fixed seed for reproducibility.
train, test = df_tfidf_lab.randomSplit([0.7, 0.3], 1234)

nb = NaiveBayes(featuresCol="features", labelCol="indexed", predictionCol="prediction",
                probabilityCol="probability", rawPredictionCol="rawPrediction",
                smoothing=1.0,
                modelType="multinomial")
model = nb.fit(train)

predictions = model.transform(test)
predictions.show()

evaluator = MulticlassClassificationEvaluator(labelCol="indexed",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

spark.stop()  # release cluster resources