Java - Word Frequency Counting

static String fileName = "test001";

// Count word frequencies
@Test
public void testWordCount() throws IOException {
	// Get a buffered stream for the file (plain blocking IO)
	BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(fileName));

	// NIO alternative (not used here): every read requires clearing the ByteBuffer,
	// which is fiddly to manage, and the end result is about the same.
	//        FileChannel fileChannel = new FileInputStream(fileName).getChannel();

	// Buffer that each read fills (plain IO)
	byte[] bufs = new byte[1024 * 4];

	// NIO equivalent of the buffer
	//        ByteBuffer bufs = ByteBuffer.allocate(1024 * 4);


	int len = 0;
	// hashMap holds the overall counts; hashMap2 tracks words shorter than five characters
	HashMap<String, Integer> hashMap = new HashMap<>();
	HashMap<String, Integer> hashMap2 = new HashMap<>();
	long start = System.currentTimeMillis();
	// Read up to bufs.length bytes from the stream into the byte array (plain IO)
	while ((len = bufferedInputStream.read(bufs)) != -1) {
		/*
		 * NIO version of the loop:
		 *   while ((len = fileChannel.read(bufs)) != -1) {
		 */
		byte[] bytes = Arrays.copyOfRange(bufs, 0, len);
		String str = new String(bytes);
		// Count the words in this chunk. A read can cut a word in half at the buffer
		// boundary, producing fragments shorter than five characters, so these counts
		// are not exactly right (see the countWithCarry sketch after this method).
		HashMap<String, Integer> hash = countByString(str);
		for (Map.Entry<String, Integer> entry : hash.entrySet()) {
			String key = entry.getKey();
			// Words shorter than five characters are likely boundary fragments; track them in hashMap2
			if (key.length() < 5) {
				incKey(key, hashMap2, entry.getValue());
			}
			// Merge this chunk's counts into the overall map
			incKey(key, hashMap, entry.getValue());
		}

	}
	bufferedInputStream.close();

	System.out.println(hashMap2.keySet().size());   // distinct short words (likely split fragments)
	System.out.println("time: " + (System.currentTimeMillis() - start) + "ms");
	System.out.println(hashMap.get("ababb"));       // spot-check a single word's count
	System.out.println(hashMap.size());             // number of distinct words
}
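As noted in the comment above, a 4 KB read can cut a word in half, so the per-chunk counts are not exact. A minimal sketch of one way around this (not part of the original post; the helper name countWithCarry is made up here) is to hold on to the trailing partial token and prepend it to the next chunk before counting:

// Sketch only: a variant of the read loop that carries the trailing partial token
// over to the next buffer, so words are never split at 4 KB boundaries.
// Reuses countByString/incKey from this post; needs java.nio.charset.StandardCharsets.
public HashMap<String, Integer> countWithCarry(String fileName) throws IOException {
	HashMap<String, Integer> counts = new HashMap<>();
	String carry = "";
	try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(fileName))) {
		byte[] buf = new byte[1024 * 4];
		int len;
		while ((len = in.read(buf)) != -1) {
			String chunk = carry + new String(buf, 0, len, StandardCharsets.US_ASCII);
			int lastSpace = chunk.lastIndexOf(' ');
			if (lastSpace >= 0) {
				carry = chunk.substring(lastSpace + 1);   // possibly incomplete last word
				chunk = chunk.substring(0, lastSpace);
			} else {
				carry = chunk;                            // no space yet: the whole chunk is a fragment
				chunk = "";
			}
			for (Map.Entry<String, Integer> e : countByString(chunk).entrySet()) {
				incKey(e.getKey(), counts, e.getValue());
			}
		}
	}
	if (!carry.isEmpty()) {
		incKey(carry, counts, 1);                         // count whatever is left after the last read
	}
	return counts;
}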


/**
 * Count the words in a string.
 *
 * @param str text to tokenize
 * @return map from word to number of occurrences
 */
public HashMap<String, Integer> countByString(String str) {
	HashMap<String, Integer> hashMap = new HashMap<>();
	// Split the string on spaces and count each token
	StringTokenizer stringTokenizer = new StringTokenizer(str, " ");

	while (stringTokenizer.hasMoreTokens()) {
		String word = stringTokenizer.nextToken();
		incKey(word, hashMap, 1);
	}

	return hashMap;
}
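A quick sanity check for countByString (this little test is not in the original post). Note that the tokenizer splits on single spaces only, so newlines or punctuation stay attached to the surrounding word:

@Test
public void testCountByString() {
	// "ab" appears twice, "cd" once
	HashMap<String, Integer> demo = countByString("ab ab cd");
	System.out.println(demo.get("ab") + " " + demo.get("cd"));   // prints: 2 1
}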

/**
 * Add word to hashMap, accumulating its count.
 *
 * @param word    the word being counted
 * @param hashMap map to update
 * @param n       how much this occurrence adds to the count
 */
public void incKey(String word, HashMap<String, Integer> hashMap, Integer n) {
	if (hashMap.containsKey(word)) {
		hashMap.put(word, hashMap.get(word) + n);
	} else {
		hashMap.put(word, n);
	}
}
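On JDK 8+ the body of incKey can be collapsed into a single Map.merge call with the same behavior:

	// Equivalent to the if/else above: inserts n when the key is absent,
	// otherwise adds n to the existing count.
	hashMap.merge(word, n, Integer::sum);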


@Test
// Count word frequencies: multithreaded version
public void wordCount() throws ExecutionException, InterruptedException {

	System.out.println("processors:" + Runtime.getRuntime().availableProcessors());
	run(fileName, 1024 * 1024 * 10);   // split the file into 10 MB chunks
}
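The post never shows what test001 contains. Assuming it is just space-separated lowercase words, a throwaway generator like the one below (the method name and sizes are made up here) produces a file both tests can run against:

// Sketch only: write ~6 MB of random five-letter words built from 'a' and 'b',
// separated by single spaces, so tokens like "ababb" actually occur.
// Needs java.io.BufferedWriter, java.io.FileWriter and java.util.Random.
@Test
public void generateTestFile() throws IOException {
	Random random = new Random();
	try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName))) {
		for (int i = 0; i < 1_000_000; i++) {
			char[] word = new char[5];
			for (int j = 0; j < word.length; j++) {
				word[j] = (char) ('a' + random.nextInt(2));   // only 'a' or 'b'
			}
			writer.write(word);
			writer.write(' ');
		}
	}
}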

// Shared pool; by default its parallelism matches the number of available processors
final ForkJoinPool forkJoinPool = new ForkJoinPool();


// Task: count the words in one mapped region [start, end) of the file
class CountTask implements Callable<HashMap<String, Integer>> {
	private final Long start;
	private final Long end;
	private final String fileName;

	public CountTask(Long start, Long end, String fileName) {
		this.start = start;
		this.end = end;
		this.fileName = fileName;
	}

	/**
	 * Computes a result, or throws an exception if unable to do so.
	 *
	 * @return computed result
	 * @throws Exception if unable to compute a result
	 */
	@Override
	public HashMap<String, Integer> call() throws Exception {
		// Open the file read-only and take its FileChannel
		try (FileChannel fileChannel = new RandomAccessFile(this.fileName, "r").getChannel()) {
			// Map the [start, end) region of the file straight into memory,
			// skipping the extra copies a normal read would make
			MappedByteBuffer mbuf = fileChannel.map(FileChannel.MapMode.READ_ONLY, this.start, this.end - this.start);

			String str = StandardCharsets.US_ASCII.decode(mbuf).toString();
			return countByString(str);
		}
	}
}
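The mapped chunks have the same boundary problem as the IO version: position and next are raw byte offsets, so a chunk edge can land in the middle of a word. One way to avoid that (sketch only; alignToWordBoundary is not part of the original post) is to nudge each boundary forward to the next space before building a task:

// Sketch only: advance pos to the first space at or after it, so every chunk
// starts and ends on a word boundary. Byte-by-byte reads are slow, but the
// adjustment runs only once per chunk. Needs java.nio.ByteBuffer.
long alignToWordBoundary(FileChannel channel, long pos, long fileSize) throws IOException {
	ByteBuffer one = ByteBuffer.allocate(1);
	while (pos < fileSize) {
		one.clear();
		channel.read(one, pos);            // absolute read of a single byte
		if (one.get(0) == ' ') {
			break;
		}
		pos++;
	}
	return pos;
}

run() could open one read-only channel up front and pass each candidate boundary (Math.min(fileSize, chunkSize + position)) through this helper before creating the task.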

/**
 * Split the file into chunks, count each chunk on the pool, and merge the results.
 *
 * @param fileName  file to count
 * @param chunkSize size in bytes of each mapped chunk (one task per chunk)
 */
public void run(String fileName, long chunkSize) throws ExecutionException, InterruptedException {
	File file = new File(fileName);
	long fileSize = file.length();

	long position = 0;

	long startTime = System.currentTimeMillis();
	// Futures holding each task's partial counts
	ArrayList<Future<HashMap<String, Integer>>> tasks = new ArrayList<>();
	while (position < fileSize) {
		// Clamp the last chunk so its end never runs past the file size
		long next = Math.min(fileSize, chunkSize + position);
		// One task per chunkSize-sized slice of the file. Note that a raw byte offset can
		// split a word across two chunks (see the alignToWordBoundary sketch above).
		CountTask task = new CountTask(position, next, fileName);
		position = next;
		// Hand the task to the pool; the Future will carry that chunk's word counts
		Future<HashMap<String, Integer>> future = forkJoinPool.submit(task);
		tasks.add(future);
	}
	System.out.format("split to %d tasks\n", tasks.size());

	HashMap<String, Integer> total = new HashMap<>();

	// Merge each chunk's partial counts into the total map
	for (Future<HashMap<String, Integer>> future : tasks) {
		HashMap<String, Integer> map = future.get();
		for (Map.Entry<String, Integer> entry : map.entrySet()) {
			incKey(entry.getKey(), total, entry.getValue());
		}
	}
	System.out.println("time:" + (System.currentTimeMillis() - startTime) + "ms");
	System.out.println("total:" + total.size());   // number of distinct words

	System.out.println(total.get("ababb"));        // spot-check a single word's count