Java 实战-比较两个文件夹中所有的文件是否存在相同电话号码

70 阅读2分钟

问题:

今天收到一个需求:有大量的表格,格式不确定,里面有一列是电话号码,在哪一列不确定,需要对比哪些表格数据是相同的。表格的格式有csv,xls,xlsx,txt,编码格式有gbk,utf-8,utf-16,看第二堆里面有多少个第一堆里的,里面各种稀奇古怪的数据,只有电话号码是基本上每个表格都有的,从这个方向入手的。

1.定义好手机号码和固定电话的正则表达

企业微信截图_17131748777961.png

2.定义好读取text 文件 并按空格分组

企业微信截图_17131750113914.png

3.定义好读取excel文件

企业微信截图_17131750471529.png

4.使用Files.walk 遍历读取该文件夹中所有的文件并且以正确的格式读取

企业微信截图_17131750673710.png

5.打印出相同的数据并按组分类

企业微信截图_17131751647755.png

源码:

import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.*;
import java.util.regex.Pattern;

public class PhoneNumberExtractor {

    private static final Pattern MOBILE_PATTERN = Pattern.compile("^1[3-9]\\d{9}$");
    private static final Pattern LANDLINE_PATTERN = Pattern.compile("^(0\\d{2,3}-?)?\\d{7,8}$");


    public static boolean isMobilePhoneNumber(String phoneNumber) {
        return MOBILE_PATTERN.matcher(phoneNumber).matches();
    }

    public static boolean isLandlinePhoneNumber(String phoneNumber) {
        return LANDLINE_PATTERN.matcher(phoneNumber).matches();
    }

    public static boolean check(String phoneNumber) {
        return isLandlinePhoneNumber(phoneNumber) || isMobilePhoneNumber(phoneNumber);
    }
    public static void main(String[] args) throws IOException {
        System.out.println("---------------开始-------------------");
        String folderPath1 = "C:\\docmentHistory\\y";
        String folderPath2 = "C:\\docmentHistory\\h";

        HashSet<String> phoneNumbersFolder1 = new HashSet<>();
        HashSet<String> phoneNumbersFolder2 = new HashSet<>();

        Map<String, Set<String>> phoneNumbersMap1 = new HashMap<>();

        Map<String, Set<String>> phoneNumbersMap2 = new HashMap<>();

        // 读取第一个文件夹中的电话号码
        processFilesFromFolder(folderPath1, phoneNumbersFolder1,phoneNumbersMap1);

        // 读取第二个文件夹中的电话号码
        processFilesFromFolder(folderPath2, phoneNumbersFolder2,phoneNumbersMap2);

        // 检查第二个文件夹中的电话号码是否存在于第一个文件夹中
        for (String number : phoneNumbersFolder2) {
            if (phoneNumbersFolder1.contains(number)) {
                System.out.println("------------------------------------------");
                System.out.println("\r\n电话号码 " + number + " 第一个文件夹中存在文件名为:"+phoneNumbersMap1.get(number));
                System.out.println("电话号码 " + number + " 第二个文件夹中存在文件名为:"+phoneNumbersMap2.get(number)+"\r\n");
                System.out.println("++++++++++++++++++++++++++++++++++++++++++");
            }
        }
        System.out.println("---------------结束-------------------");
    }

    private static void processFilesFromFolder(String folderPath, HashSet<String> phoneNumbers,Map<String, Set<String>> phoneNumbersMap) throws IOException {
        Files.walk(Paths.get(folderPath))
                .filter(Files::isRegularFile)
                .forEach(file -> {
                    String fileName = file.toString();
                    try {
                        if (fileName.endsWith(".csv") || fileName.endsWith(".txt")) {
                            readTextFile(fileName, phoneNumbers, StandardCharsets.UTF_8,phoneNumbersMap); // 或其他编码
                        } else if (fileName.endsWith(".xls")) {
                            readExcelFile(fileName, phoneNumbers, false,phoneNumbersMap);
                        } else if (fileName.endsWith(".xlsx")) {
                            readExcelFile(fileName, phoneNumbers, true,phoneNumbersMap);
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                });
    }

    private static void readTextFile(String filePath, HashSet<String> phoneNumbers, Charset charset,Map<String, Set<String>> phoneNumbersMap) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] groups = line.trim().split("\\s+");
                for (String group : groups) {
                    if (check(group)) {
                        phoneNumbers.add(group);
                        phoneNumbersMap.computeIfAbsent(group,v->new HashSet<>()).add(filePath);
                    }
                }
            }
        }
    }

    private static void readExcelFile(String filePath, HashSet<String> phoneNumbers, boolean isXlsx,Map<String, Set<String>> phoneNumbersMap) throws IOException {
        try (InputStream inp = new FileInputStream(filePath)) {
            Workbook workbook = isXlsx ? new XSSFWorkbook(inp) : new HSSFWorkbook(inp);
            for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
                Sheet sheet = workbook.getSheetAt(sheetNum);
                for (Row row : sheet) {
                    for (Cell cell : row) {
                        if (cell.getCellType() == CellType.STRING || cell.getCellType() == CellType.NUMERIC) {
                            if(check(cell.getStringCellValue())){
                                phoneNumbers.add(cell.getStringCellValue());
                                phoneNumbersMap.computeIfAbsent(cell.getStringCellValue(),v->new HashSet<>()).add(filePath);
                            }
                        }
                    }
                }
            }
        }
    }
}


总结:

- 本程序使用Files.walk方法递归遍历文件夹中的所有文件。