1. map vs mapPartitions
- map: processes one element at a time
- mapPartitions: processes one whole partition at a time
package com.strivelearn.scala
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author strivelearn
 * @version MapPartitionsOpScala.scala, 2022-11-27
 */
object MapPartitionsOpScala {
  def main(args: Array[String]): Unit = {
    // Create the SparkContext
    val conf = new SparkConf()
    conf.setAppName("MapPartitionsOpScala")
      .setMaster("local")
    val context = new SparkContext(conf)
    val dataRDD = context.parallelize(Array(1, 2, 3, 4, 5), 2)
    // The map operator processes one element at a time,
    // so the separator line is printed 5 times
    val sum = dataRDD.map(item => {
      println("================================")
      item * 2
    }).reduce(_ + _)
    println(sum)
    context.stop()
  }
}
mapPartitions
package com.strivelearn.scala
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
 * @author strivelearn
 * @version MapPartitionsOpScala.scala, 2022-11-27
 */
object MapPartitionsOpScala {
  def main(args: Array[String]): Unit = {
    // Create the SparkContext
    val conf = new SparkConf()
    conf.setAppName("MapPartitionsOpScala")
      .setMaster("local")
    val context = new SparkContext(conf)
    val dataRDD = context.parallelize(Array(1, 2, 3, 4, 5), 2)
    val sum = dataRDD.mapPartitions(item => {
      // Printed only 2 times, because there are 2 partitions
      println("================================")
      val result = new ArrayBuffer[Int]()
      item.foreach(i => {
        result += i * 2
      })
      result.toIterator
    }).reduce(_ + _)
    println(sum)
    context.stop()
  }
}
- When each element needs a database connection, use mapPartitions and create the connection inside mapPartitions, so one connection is opened per partition instead of per element (see the sketch after this list)
- This reduces the number of database connections and improves performance on the database side
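A minimal, self-contained Scala sketch of this connection-per-partition pattern (not part of the original examples): FakeDbConnection, lookupDouble, and the object name MapPartitionsDbSketch are hypothetical stand-ins for a real database client such as JDBC.
package com.strivelearn.scala
import org.apache.spark.{SparkConf, SparkContext}
// Hypothetical stand-in for a real database client; only here so the sketch compiles on its own
class FakeDbConnection {
  def lookupDouble(value: Int): Int = value * 2
  def close(): Unit = ()
}
object MapPartitionsDbSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MapPartitionsDbSketch").setMaster("local")
    val context = new SparkContext(conf)
    val dataRDD = context.parallelize(Array(1, 2, 3, 4, 5), 2)
    val resultRDD = dataRDD.mapPartitions(partition => {
      // Open one "connection" per partition instead of one per element
      val connection = new FakeDbConnection
      // Materialize the results before closing the connection,
      // because the iterator produced by map is evaluated lazily
      val results = partition.map(i => connection.lookupDouble(i)).toList
      connection.close()
      results.iterator
    })
    println(resultRDD.collect().mkString(","))
    context.stop()
  }
}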
Java
package com.strivelearn.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * @author strivelearn
 * @version MapPartitionsOpJava.java, 2022-11-27
 */
public class MapPartitionsOpJava {
    public static void main(String[] args) {
        // 1. Create the SparkContext
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("MapPartitionsOpJava");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        // Create the collection
        List<Integer> integers = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> rdd = javaSparkContext.parallelize(integers, 2);
        // mapPartitions receives the iterator of an entire partition at once
        Integer sum = rdd.mapPartitions(it -> {
            ArrayList<Integer> list = new ArrayList<>();
            while (it.hasNext()) {
                list.add(it.next() * 2);
            }
            return list.iterator();
        }).reduce((x, y) -> x + y);
        System.out.println(sum);
        javaSparkContext.stop();
    }
}
2. foreach vs foreachPartition
- foreach: processes one element at a time
- foreachPartition: processes one whole partition at a time (a short sketch of the difference follows below)
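A short Scala sketch of the difference, written in the same style as the map/mapPartitions examples above; the object name ForeachPartitionOpScala is just a placeholder.
package com.strivelearn.scala
import org.apache.spark.{SparkConf, SparkContext}
object ForeachPartitionOpScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("ForeachPartitionOpScala")
      .setMaster("local")
    val context = new SparkContext(conf)
    val dataRDD = context.parallelize(Array(1, 2, 3, 4, 5), 2)
    // foreach is invoked once per element, so the separator line is printed 5 times
    dataRDD.foreach(item => {
      println("================================")
      println(item)
    })
    // foreachPartition is invoked once per partition, so the separator line is printed 2 times
    dataRDD.foreachPartition(it => {
      println("================================")
      it.foreach(println(_))
    })
    context.stop()
  }
}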
3. Using repartition
repartition redistributes the data of an RDD across a new number of partitions:
- lets you adjust the parallelism of the RDD
- can help address data skew in the RDD
package com.strivelearn.scala
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author strivelearn
 * @version RepartitionOpScala.scala, 2022-11-27
 */
object RepartitionOpScala {
  def main(args: Array[String]): Unit = {
    // Create the SparkContext
    val conf = new SparkConf()
    conf.setAppName("RepartitionOpScala")
      .setMaster("local")
    val context = new SparkContext(conf)
    val dataRDD = context.parallelize(Array(1, 2, 3, 4, 5), 2)
    // Repartition from 2 to 3 partitions; the separator line is printed once per partition
    dataRDD.repartition(3).foreachPartition(it => {
      println("================================")
      it.foreach(println(_))
    })
    context.stop()
  }
}
Java
package com.strivelearn.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
import java.util.List;
/**
 * @author strivelearn
 * @version RepartitionOpJava.java, 2022-11-27
 */
public class RepartitionOpJava {
    public static void main(String[] args) {
        // 1. Create the SparkContext
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("RepartitionOpJava");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        // Create the collection
        List<Integer> integers = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> rdd = javaSparkContext.parallelize(integers, 2);
        // Repartition from 2 to 3 partitions; one separator line is printed per partition
        rdd.repartition(3).foreachPartition(it -> {
            System.out.println("================================");
            while (it.hasNext()) {
                System.out.println(it.next());
            }
        });
        javaSparkContext.stop();
    }
}
4. reduceByKey and groupByKey
Q: What is the difference between these two operators when implementing group-by aggregation?
package com.strivelearn.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
/**
 * @author strivelearn
 * @version GroupByKeyAndReduceByKeyOp.java, 2022-11-27
 */
public class GroupByKeyAndReduceByKeyOp {
    public static void main(String[] args) {
        // 1. Create the SparkContext
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("GroupByKeyAndReduceByKeyOp").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> dataRDD = javaSparkContext.parallelize(Arrays.asList("hello you", "hello me"));
        JavaRDD<String> wordsRDD = dataRDD.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // reduceByKey aggregates the values of each key directly
        JavaPairRDD<String, Integer> wordCountRDD = wordsRDD
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((x, y) -> x + y);
        // Expected output:
        // key you value=1
        // key hello value=2
        // key me value=1
        wordCountRDD.foreach(tuple -> System.out.println("key " + tuple._1 + " value=" + tuple._2));
        javaSparkContext.stop();
    }
}
groupByKey
package com.strivelearn.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
/**
 * @author xys
 * @version GroupByKeyAndReduceByKeyOp.java, 2022-11-27
 */
public class GroupByKeyAndReduceByKeyOp {
    public static void main(String[] args) {
        // 1. Create the SparkContext
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("GroupByKeyAndReduceByKeyOp").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> dataRDD = javaSparkContext.parallelize(Arrays.asList("hello you", "hello me"));
        JavaRDD<String> wordsRDD = dataRDD.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // groupByKey only groups the values of each key; the sums have to be computed manually afterwards
        JavaRDD<Tuple2<String, Integer>> wordCountRDD = wordsRDD
                .mapToPair(word -> new Tuple2<>(word, 1))
                .groupByKey()
                .map(wc -> {
                    int sum = 0;
                    for (Integer i : wc._2) {
                        sum += i;
                    }
                    return new Tuple2<>(wc._1, sum);
                });
        wordCountRDD.foreach(tuple -> System.out.println("key " + tuple._1 + " value=" + tuple._2));
        javaSparkContext.stop();
    }
}
- Whenever reduceByKey can do the job, prefer it over groupByKey: reduceByKey performs a local combine on the map side first, which greatly reduces the amount of data shuffled to the reduce side and lowers network overhead (a compact Scala comparison follows below).
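As a rough illustration, here is a compact Scala sketch of the same word count done both ways; the object name WordCountCompareScala is a placeholder. reduceByKey sums within each map task before the shuffle, while groupByKey shuffles every (word, 1) pair and sums only after grouping.
package com.strivelearn.scala
import org.apache.spark.{SparkConf, SparkContext}
object WordCountCompareScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCountCompareScala").setMaster("local")
    val context = new SparkContext(conf)
    val wordsRDD = context.parallelize(Array("hello you", "hello me"))
      .flatMap(_.split(" "))
      .map(word => (word, 1))
    // reduceByKey: combines locally within each map task, then shuffles the partial sums
    wordsRDD.reduceByKey(_ + _)
      .foreach(t => println("key " + t._1 + " value=" + t._2))
    // groupByKey: shuffles every (word, 1) pair, then sums after grouping
    wordsRDD.groupByKey()
      .map(t => (t._1, t._2.sum))
      .foreach(t => println("key " + t._1 + " value=" + t._2))
    context.stop()
  }
}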