// Path of the input file that both word-count tests in this class read.
static String fileName = "test001"
// 统计词频
@Test
public void testWordCount() throws IOException {
//先获得文件的缓冲流 IO方式
BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(fileName))
// NIO方式 不喜欢, 每次读取 都需要清空 byteBuffer 的缓存 比较麻烦 不好控制 , 并且 最终的实现效果差不多
// FileChannel fileChannel = new FileInputStream(fileName).getChannel()
//将数据存放在 bufs 中 byte[] IO
byte[] bufs = new byte[1024 * 4]
// NIO
// ByteBuffer bufs = ByteBuffer.allocate(1024*4)
//
int len = 0
HashMap<String, Integer> hashMap = new HashMap<>()
HashMap<String, Integer> hashMap2 = new HashMap<>()
Long start = System.currentTimeMillis()
//从此输入流中读取最多byte.length个字节的数据到一个字节数组中 IO
while ((len = bufferedInputStream.read(bufs)) != -1) {
/*
* while ((len = fileChannel.read(bufs)) != -1) {
*/
byte[] bytes = Arrays.copyOfRange(bufs, 0, len)
String str = new String(bytes)
//统计在 bytes 中的单词 [可能会出现 不满足五个字符的单词的情况出现,所以这样的统计并不是完全正确的]
HashMap<String, Integer> hash = countByString(str)
for (Map.Entry<String, Integer> entry : hash.entrySet()) {
String key = entry.getKey()
//统计不满一个五个字符单词的数量 hashmap2
if (key.length() < 5) {
incKey(key, hashMap2, entry.getValue())
}
// 把这次统计的数据保存到返回值中
incKey(key, hashMap, entry.getValue())
}
}
System.out.println(hashMap2.keySet().size())
System.out.println("time: " + (System.currentTimeMillis() - start) + "ms")
System.out.println(hashMap.get("ababb"))
System.out.println(hashMap.size())
}
/**
* 统计单词数量
*
* @param str
* @return
*/
public HashMap<String, Integer> countByString(String str) {
HashMap<String, Integer> hashMap = new HashMap<>()
// 分割字符串, 类似于数单词
StringTokenizer stringTokenizer = new StringTokenizer(str, " ")
while (stringTokenizer.hasMoreTokens()) {
String word = stringTokenizer.nextToken()
incKey(word, hashMap, 1)
}
return hashMap
}
/**
* 将相同的str 保存到hashmap 中 , 并统计count
*
* @param word
* @param hashMap
* @param n 每一个代表的个数
*/
public void incKey(String word, HashMap<String, Integer> hashMap, Integer n) {
if (hashMap.containsKey(word)) {
hashMap.put(word, hashMap.get(word) + n)
} else {
hashMap.put(word, n)
}
}
@Test
//统计词频 多线程版本
public void wordCount() throws ExecutionException, InterruptedException {
System.out.println("processors:" + Runtime.getRuntime().availableProcessors())
run(fileName, 1024*1024*10)
}
// Pool that executes the per-chunk counting tasks submitted by run(); built with the
// no-argument constructor, i.e. the default ForkJoinPool settings.
// NOTE(review): no shutdown() call is visible in this part of the file — confirm the pool's lifecycle.
final ForkJoinPool forkJoinPool = new ForkJoinPool()
// 任务
class countTask implements Callable<HashMap<String, Integer>> {
private final Long start
private final Long end
private final String fileName
public countTask(Long start, Long end, String fileName) {
this.start = start
this.end = end
this.fileName = fileName
}
/**
* Computes a result, or throws an exception if unable to do so.
*
* @return computed result
* @throws Exception if unable to compute a result
*/
@Override
public HashMap<String, Integer> call() throws Exception {
// 创建一个随机访问 文件的流 并返回他的 fileChannel 管道
FileChannel fileChannel = new RandomAccessFile(this.fileName, "rw").getChannel()
// 将此通道文件的一个区域直接映射到内存中。直接返回文件的一段文件的内存映射 即跳过两次拷贝
MappedByteBuffer mbuf = fileChannel.map(FileChannel.MapMode.READ_ONLY, this.start, this.end - this.start)
String str = StandardCharsets.US_ASCII.decode(mbuf).toString()
return countByString(str)
}
}
/**
* @param fileName
* @param chunkSize
*/
public void run(String fileName, long chunkSize) throws ExecutionException, InterruptedException {
File file = new File(fileName)
Long fileSize = file.length()
long position = 0
Long startTime = System.currentTimeMillis()
// 保存每次任务的数据
ArrayList<Future<HashMap<String, Integer>>> tasks = new ArrayList<Future<HashMap<String, Integer>>>()
while (position < fileSize) {
//防止 最后一次任务的 标志 大于 文件的大小
long next = Math.min(fileSize, chunkSize + position)
// 截取 文件, 将文件 分成每个 chunksize 大小 制作成一个task
countTask task = new countTask(position, next, fileName)
position = next
//交由线程池去处理task 并将结果返回至tasks中 里面包裹的是 截取段 词频数据
Future<HashMap<String, Integer>> future = forkJoinPool.submit(task)
tasks.add(future)
}
System.out.format("split to %d tasks\n", tasks.size())
HashMap<String,Integer> total = new HashMap<>()
for(Future<HashMap<String, Integer>> futures : tasks){
HashMap<String,Integer> map = futures.get()
for(Map.Entry<String,Integer> entry:map.entrySet()){
incKey(entry.getKey(),total,entry.getValue())
}
}
System.out.println("time:" + (System.currentTimeMillis() - startTime) + "ms")
System.out.println("total:" + total.size())
System.out.println(total.get("ababb"))
}