场景题:有40亿个QQ号如何去重?仅1GB内存场景题也有一些套路可以考虑,比如去重、判断给定数据是否存在 1.大数据去重 - 掘金
随机生成QQ号
import random
def get_num(bit):
random_num = ''.join([
str(random.randint(0, 9)) if i > 0
else str(random.randint(1, 9))
for i in range(bit)])
# print(random_num)
return random_num
def write_2_file(f, bit, count):
for i in range(count):
f.write(get_num(bit) + "\n")
if __name__ == "__main__":
with open('qq2.txt', 'w') as f:
write_2_file(f, 6, 100000)
write_2_file(f, 7, 1000000)
write_2_file(f, 8, 10000000)
write_2_file(f, 9, 100000000)
print("--")
write_2_file(f, 10, 1000000000)
校验
package org.example;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import java.io.File;
import java.io.IOException;
import java.util.BitSet;
// 需调整JVM参数 -Xmx1g
public class BitMapDemo {
private static final long INT_UNSIGN_MAX = ((long) Integer.MAX_VALUE) * 2 + 1;
public static void main(String[] args) {
//初始化长度为2 ^ 32位的位数组
BitSet bitmap = new BitSet((int) (1L << 32));
//读取QQ号,如果该位为0,标记为1;否则数据重复
try (LineIterator it = FileUtils.lineIterator(new File("qq2.txt"), "UTF-8")) {
while (it.hasNext()) {
String line = it.next();
if (line.compareTo(String.valueOf(INT_UNSIGN_MAX)) > 0) continue;
long qq = Long.parseLong(line);
System.out.println(qq);
if (!bitmap.get((int) qq)) {
//数据不存在才set 1,存在则去重了
bitmap.set((int) qq);
} else {
System.out.println(qq + " 重复");
}
//最后,遍历Bitmap位数组,标记为1的位置就是去重后的结果了
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
遇到的问题,当数字大于 Integer.MAX_VALUE = 2147483647 时会报错:
Exception in thread "main" java.lang.IndexOutOfBoundsException: bitIndex < 0: -1481481763
at java.base/java.util.BitSet.get(BitSet.java:626)
at org.example.BitMapDemo.main(BitMapDemo.java:24)
因为BitSet的set方法并不支持输入负数
public void set(int bitIndex) {
if (bitIndex < 0)
throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
int wordIndex = wordIndex(bitIndex);
expandTo(wordIndex);
words[wordIndex] |= (1L << bitIndex); // Restores invariants
checkInvariants();
}
所以,BitSet 目前好像只能支持 Integer.MAX_VALUE 个数