问题:
今天收到一个需求:有大量的表格,格式不确定,里面有一列是电话号码,在哪一列不确定,需要对比哪些表格数据是相同的。表格的格式有csv,xls,xlsx,txt,编码格式有gbk,utf-8,utf-16,看第二堆里面有多少个第一堆里的,里面各种稀奇古怪的数据,只有电话号码是基本上每个表格都有的,从这个方向入手的。
1.定义好手机号码和固定电话的正则表达
2.定义好读取text 文件 并按空格分组
3.定义好读取excel文件
4.使用Files.walk 遍历读取该文件夹中所有的文件并且以正确的格式读取
5.打印出相同的数据并按组分类
源码:
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.*;
import java.util.regex.Pattern;
public class PhoneNumberExtractor {
private static final Pattern MOBILE_PATTERN = Pattern.compile("^1[3-9]\\d{9}$");
private static final Pattern LANDLINE_PATTERN = Pattern.compile("^(0\\d{2,3}-?)?\\d{7,8}$");
public static boolean isMobilePhoneNumber(String phoneNumber) {
return MOBILE_PATTERN.matcher(phoneNumber).matches();
}
public static boolean isLandlinePhoneNumber(String phoneNumber) {
return LANDLINE_PATTERN.matcher(phoneNumber).matches();
}
public static boolean check(String phoneNumber) {
return isLandlinePhoneNumber(phoneNumber) || isMobilePhoneNumber(phoneNumber);
}
public static void main(String[] args) throws IOException {
System.out.println("---------------开始-------------------");
String folderPath1 = "C:\\docmentHistory\\y";
String folderPath2 = "C:\\docmentHistory\\h";
HashSet<String> phoneNumbersFolder1 = new HashSet<>();
HashSet<String> phoneNumbersFolder2 = new HashSet<>();
Map<String, Set<String>> phoneNumbersMap1 = new HashMap<>();
Map<String, Set<String>> phoneNumbersMap2 = new HashMap<>();
// 读取第一个文件夹中的电话号码
processFilesFromFolder(folderPath1, phoneNumbersFolder1,phoneNumbersMap1);
// 读取第二个文件夹中的电话号码
processFilesFromFolder(folderPath2, phoneNumbersFolder2,phoneNumbersMap2);
// 检查第二个文件夹中的电话号码是否存在于第一个文件夹中
for (String number : phoneNumbersFolder2) {
if (phoneNumbersFolder1.contains(number)) {
System.out.println("------------------------------------------");
System.out.println("\r\n电话号码 " + number + " 第一个文件夹中存在文件名为:"+phoneNumbersMap1.get(number));
System.out.println("电话号码 " + number + " 第二个文件夹中存在文件名为:"+phoneNumbersMap2.get(number)+"\r\n");
System.out.println("++++++++++++++++++++++++++++++++++++++++++");
}
}
System.out.println("---------------结束-------------------");
}
private static void processFilesFromFolder(String folderPath, HashSet<String> phoneNumbers,Map<String, Set<String>> phoneNumbersMap) throws IOException {
Files.walk(Paths.get(folderPath))
.filter(Files::isRegularFile)
.forEach(file -> {
String fileName = file.toString();
try {
if (fileName.endsWith(".csv") || fileName.endsWith(".txt")) {
readTextFile(fileName, phoneNumbers, StandardCharsets.UTF_8,phoneNumbersMap); // 或其他编码
} else if (fileName.endsWith(".xls")) {
readExcelFile(fileName, phoneNumbers, false,phoneNumbersMap);
} else if (fileName.endsWith(".xlsx")) {
readExcelFile(fileName, phoneNumbers, true,phoneNumbersMap);
}
} catch (IOException e) {
e.printStackTrace();
}
});
}
private static void readTextFile(String filePath, HashSet<String> phoneNumbers, Charset charset,Map<String, Set<String>> phoneNumbersMap) throws IOException {
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), charset))) {
String line;
while ((line = reader.readLine()) != null) {
String[] groups = line.trim().split("\\s+");
for (String group : groups) {
if (check(group)) {
phoneNumbers.add(group);
phoneNumbersMap.computeIfAbsent(group,v->new HashSet<>()).add(filePath);
}
}
}
}
}
private static void readExcelFile(String filePath, HashSet<String> phoneNumbers, boolean isXlsx,Map<String, Set<String>> phoneNumbersMap) throws IOException {
try (InputStream inp = new FileInputStream(filePath)) {
Workbook workbook = isXlsx ? new XSSFWorkbook(inp) : new HSSFWorkbook(inp);
for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
Sheet sheet = workbook.getSheetAt(sheetNum);
for (Row row : sheet) {
for (Cell cell : row) {
if (cell.getCellType() == CellType.STRING || cell.getCellType() == CellType.NUMERIC) {
if(check(cell.getStringCellValue())){
phoneNumbers.add(cell.getStringCellValue());
phoneNumbersMap.computeIfAbsent(cell.getStringCellValue(),v->new HashSet<>()).add(filePath);
}
}
}
}
}
}
}
}
总结:
- 本程序使用Files.walk方法递归遍历文件夹中的所有文件。