关于有40亿个QQ号如何去重的实践

9 阅读1分钟

参考文章:《场景题:有40亿个QQ号如何去重?仅1GB内存》(掘金)。场景题也有一些套路可以考虑,比如去重、判断给定数据是否存在;本文实践的是其中的“1. 大数据去重”。

随机生成QQ号

import random


def get_num(bit):
    """Return a random decimal string that is ``bit`` digits long.

    The leading digit is drawn from 1-9 so the number never starts with a
    zero; every following digit is drawn from 0-9. A ``bit`` of 0 yields
    the empty string.
    """
    digits = []
    for position in range(bit):
        # Only the first position is barred from being zero.
        lowest = 1 if position == 0 else 0
        digits.append(str(random.randint(lowest, 9)))
    return ''.join(digits)


def write_2_file(f, bit, count):
    """Write ``count`` random ``bit``-digit numbers to ``f``, one per line.

    ``f`` is any object with a ``write`` method that accepts text; each
    number comes from ``get_num(bit)`` and is followed by a newline.
    """
    for _ in range(count):
        line = f"{get_num(bit)}\n"
        f.write(line)


if __name__ == "__main__":
    # Generate the sample data set: for every length from 6 to 10 digits,
    # write 10 ** (length - 1) random numbers to qq2.txt.
    with open('qq2.txt', 'w') as f:
        for digits in range(6, 10):
            write_2_file(f, digits, 10 ** (digits - 1))
        # Progress marker before the largest (10-digit) batch.
        print("--")
        write_2_file(f, 10, 10 ** 9)

校验

package org.example;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

import java.io.File;
import java.io.IOException;
import java.util.BitSet;

/**
 * De-duplicates the QQ numbers stored one per line in {@code qq2.txt} using a
 * bitmap that holds one bit for every value in [0, 2^32).
 *
 * <p>Requires the JVM option {@code -Xmx1g}: the two bitmaps allocated in
 * {@link #main(String[])} take 512 MiB in total.
 */
public class BitMapDemo {

    /** Largest unsigned 32-bit value: 2^32 - 1 = 4294967295. */
    private static final long INT_UNSIGN_MAX = ((long) Integer.MAX_VALUE) * 2 + 1;

    /** Number of values a single BitSet can address (indices 0 .. Integer.MAX_VALUE): 2^31. */
    private static final long HALF_RANGE = 1L << 31;

    /**
     * Reads {@code qq2.txt} line by line and prints every number that has
     * already been seen, followed by " 重复".
     *
     * <p>Blank lines, lines that are not a decimal long, and values outside
     * [0, 2^32 - 1] are skipped.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // A BitSet index is a non-negative int, so one BitSet covers only 2^31 bits.
        // Two of them (low half and high half) cover the whole range [0, 2^32).
        // Both are sized up front so they never have to grow while reading.
        BitSet low = new BitSet(Integer.MAX_VALUE);
        BitSet high = new BitSet(Integer.MAX_VALUE);

        try (LineIterator it = FileUtils.lineIterator(new File("qq2.txt"), "UTF-8")) {
            while (it.hasNext()) {
                String line = it.next().trim();
                if (line.isEmpty()) {
                    continue;
                }

                long qq;
                try {
                    qq = Long.parseLong(line);
                } catch (NumberFormatException e) {
                    System.err.println("skipping malformed line: " + line);
                    continue;
                }

                // Compare numerically: a string comparison would order "999999"
                // after "4294967295" and wrongly discard it.
                if (qq < 0 || qq > INT_UNSIGN_MAX) {
                    continue;
                }

                if (markSeen(low, high, qq)) {
                    System.out.println(qq + " 重复");
                }
            }
            // To obtain the de-duplicated result, iterate both bitmaps: every set bit
            // is one distinct QQ number (add 2^31 to the indices taken from `high`).
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Marks {@code qq} as seen and reports whether it had been seen before.
     *
     * @param low  bitmap for values in [0, 2^31)
     * @param high bitmap for values in [2^31, 2^32)
     * @param qq   value in [0, 2^32 - 1]
     * @return {@code true} if the bit for {@code qq} was already set
     */
    private static boolean markSeen(BitSet low, BitSet high, long qq) {
        BitSet half = qq < HALF_RANGE ? low : high;
        // Keep the low 31 bits: the offset of qq inside its half, always non-negative.
        int index = (int) (qq & Integer.MAX_VALUE);
        if (half.get(index)) {
            return true;
        }
        half.set(index);
        return false;
    }
}

遇到的问题:当数字大于 Integer.MAX_VALUE(2147483647)时,(int) 强制转换会溢出成负数,于是报错:

Exception in thread "main" java.lang.IndexOutOfBoundsException: bitIndex < 0: -1481481763
	at java.base/java.util.BitSet.get(BitSet.java:626)
	at org.example.BitMapDemo.main(BitMapDemo.java:24)

因为 BitSet 的 get 和 set 方法都不接受负数下标(上面的异常是由 get 抛出的,set 中的检查与之相同):

/**
 * Sets the bit at {@code bitIndex} to 1.
 *
 * @param bitIndex index of the bit to set; must not be negative
 * @throws IndexOutOfBoundsException if {@code bitIndex} is negative
 */
public void set(int bitIndex) {
    // The index is a signed int, so a long in (Integer.MAX_VALUE, 2^32) that was
    // narrowed to int arrives here as a negative number and is rejected.
    if (bitIndex < 0)
        throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);

    // Presumably the position of the 64-bit word that holds this bit (helper not shown).
    int wordIndex = wordIndex(bitIndex);
    // Presumably grows the backing array so that wordIndex is valid (helper not shown).
    expandTo(wordIndex);

    // Java uses only the low 6 bits of the shift count for a long shift, so this
    // sets bit (bitIndex mod 64) inside the selected word.
    words[wordIndex] |= (1L << bitIndex); // Restores invariants

    checkInvariants();
}

所以,单个 BitSet 的下标只能取 0 到 Integer.MAX_VALUE,也就是最多 2^31 个比特位;要覆盖 2^32 个取值,需要使用两个 BitSet(分别存放低半区和高半区),或者直接使用 long[] 数组自行实现位图。