做GIS开发的看过来,用java获取shapefile的编码

13 阅读2分钟

要在 Java 中判断 Shapefile(.shp)的编码(实际上主要是属性表 .dbf 的编码),需要注意 Shapefile 本身并不直接存储编码信息(除非附带 .cpg 文件)。因此我们可以采取以下策略:

  1. 优先检查是否存在 .cpg 文件,若有则直接读取其内容;
  2. 若无 .cpg 文件,则尝试用常见编码(如 UTF-8、GBK)读取 .dbf 文件的字段名或记录内容,通过是否出现乱码或解码异常来判断。

Java 生态中常用读取 Shapefile 的库是 GeoTools。下面提供一个完整的 Java 示例代码,使用 GeoTools 实现自动检测编码。具体代码实现如下:

import org.geotools.data.shapefile.ShapefileDataStore;
import org.geotools.data.simple.SimpleFeatureCollection;
import org.geotools.data.simple.SimpleFeatureIterator;
import org.geotools.data.simple.SimpleFeatureSource;
import org.opengis.feature.simple.SimpleFeature;
import org.opengis.feature.simple.SimpleFeatureType;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

public class ShapefileEncodingDetector {

    private static final List<String> CANDIDATE_ENCODINGS = Arrays.asList("UTF-8", "GBK", "GB2312");
    private static final Pattern CHINESE_PATTERN = Pattern.compile("[\u4e00-\u9fff]");
    private static final Pattern OBVIOUS_GARBAGE_PATTERN = Pattern.compile(".*[\?]{2,}.*|[\x00-\x08\x0B\x0C\x0E-\x1F].*");

    /**
     * Detects the character encoding of a Shapefile's attribute data (JDK 8 compatible).
     *
     * <p>Strategy:
     * <ol>
     *   <li>If a sibling {@code .cpg} file exists and names a charset this JVM supports, that
     *       name is returned unchanged.</li>
     *   <li>Otherwise every entry of {@code CANDIDATE_ENCODINGS} is tried in order. The field
     *       names and the attribute values of the first five features are decoded and checked
     *       with {@code isObviousGarbage}. The first candidate that decodes cleanly and yields
     *       Chinese characters is returned; the first candidate that decodes cleanly without
     *       any Chinese text is remembered as a fallback.</li>
     *   <li>If no candidate is usable, {@code "UTF-8"} is returned.</li>
     * </ol>
     *
     * @param shpFilePath path to the shapefile; {@code ".shp"} is appended when the path does
     *                    not already end with that extension (compared case-insensitively)
     * @return the detected charset name, never {@code null}
     */
    public static String detectEncoding(String shpFilePath) {
        Path shpPath = Paths.get(shpFilePath);
        if (!endsWithShpExtension(shpPath.toString())) {
            shpPath = Paths.get(shpFilePath + ".shp");
        }

        // Step 1: trust the .cpg side-car file when it names a charset we can actually use.
        Path cpgPath = findCpgFile(shpPath);
        if (cpgPath != null) {
            String encodingFromCpg = readCpgFile(cpgPath);
            if (isSupportedCharset(encodingFromCpg)) {
                System.out.println("[INFO] Encoding from .cpg: " + encodingFromCpg);
                return encodingFromCpg;
            }
        }

        String bestEncoding = null;
        final int maxSampledFeatures = 5;

        // Step 2: probe each candidate encoding.
        for (String encoding : CANDIDATE_ENCODINGS) {
            ShapefileDataStore store = null;
            try {
                store = new ShapefileDataStore(shpPath.toUri().toURL());
                store.setCharset(Charset.forName(encoding));

                SimpleFeatureSource source = store.getFeatureSource();
                SimpleFeatureType schema = source.getSchema();

                boolean valid = true;
                boolean currentHasChinese = false;

                // Validate field names.
                for (int i = 0; valid && i < schema.getAttributeCount(); i++) {
                    String fieldName = schema.getDescriptor(i).getLocalName();
                    if (isObviousGarbage(fieldName)) {
                        valid = false;
                    } else if (containsChinese(fieldName)) {
                        currentHasChinese = true;
                    }
                }
                if (!valid) {
                    continue; // the finally block below still disposes the store
                }

                // Validate the attribute values of the first few features.
                SimpleFeatureCollection features = source.getFeatures();
                SimpleFeatureIterator iter = features.features();
                try {
                    int count = 0;
                    while (valid && count < maxSampledFeatures && iter.hasNext()) {
                        SimpleFeature feature = iter.next();
                        for (Object attr : feature.getAttributes()) {
                            if (attr == null) {
                                continue;
                            }
                            String value = attr.toString();
                            if (isObviousGarbage(value)) {
                                valid = false;
                                break;
                            }
                            if (containsChinese(value)) {
                                currentHasChinese = true;
                            }
                        }
                        count++;
                    }
                } finally {
                    // Explicit close instead of try-with-resources, as in the original listing.
                    iter.close();
                }

                if (!valid) {
                    continue;
                }
                if (currentHasChinese) {
                    System.out.println("[INFO] Valid encoding with Chinese detected: " + encoding);
                    return encoding;
                }
                if (bestEncoding == null) {
                    bestEncoding = encoding;
                }
            } catch (Exception e) {
                // Best effort: a candidate that cannot be evaluated is skipped, but not silently.
                System.err.println("[WARN] Skipping encoding " + encoding + ": " + e);
            } finally {
                // Release the data store on every path, including early returns and exceptions.
                if (store != null) {
                    store.dispose();
                }
            }
        }

        if (bestEncoding != null) {
            System.out.println("[INFO] No Chinese found, using safe fallback: " + bestEncoding);
            return bestEncoding;
        }

        System.out.println("[WARN] All encodings failed, defaulting to UTF-8");
        return "UTF-8";
    }

    /** Returns whether {@code path} ends with ".shp", ignoring case. */
    private static boolean endsWithShpExtension(String path) {
        final String extension = ".shp";
        int offset = path.length() - extension.length();
        return offset >= 0 && path.regionMatches(true, offset, extension, 0, extension.length());
    }

    /**
     * Locates the .cpg file next to {@code shpPath}, whose file name must end with ".shp" in any
     * case. Only the trailing extension is swapped; returns {@code null} when no such file exists.
     */
    private static Path findCpgFile(Path shpPath) {
        String fileName = shpPath.getFileName().toString();
        String baseName = fileName.substring(0, fileName.length() - ".shp".length());
        for (String extension : new String[] {".cpg", ".CPG"}) {
            Path candidate = shpPath.resolveSibling(baseName + extension);
            if (Files.isRegularFile(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Null-safe, exception-safe wrapper around {@code Charset.isSupported}, which throws for
     * syntactically illegal names (for example an empty string or a name containing spaces).
     */
    private static boolean isSupportedCharset(String name) {
        if (name == null || name.isEmpty()) {
            return false;
        }
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException is a subclass of IllegalArgumentException.
            return false;
        }
    }

    /**
     * Reads the charset name stored in a .cpg file (JDK 8 compatible).
     *
     * <p>The file is decoded as UTF-8, which is what {@code Files.newBufferedReader(Path)} always
     * uses. A leading byte-order mark is dropped, blank lines are skipped, and the first
     * non-blank line is returned with surrounding whitespace trimmed.
     *
     * @param cpgPath path of the .cpg file to read
     * @return the trimmed content of the first non-blank line, or {@code null} when the file has
     *         no such line or cannot be read
     */
    private static String readCpgFile(Path cpgPath) {
        try (BufferedReader reader = Files.newBufferedReader(cpgPath)) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                // A UTF-8 BOM survives decoding as U+FEFF and is not removed by trim().
                if (firstLine && !line.isEmpty() && line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
                firstLine = false;

                String name = line.trim();
                if (!name.isEmpty()) {
                    return name;
                }
            }
            // Nothing but blank lines: report "no encoding" rather than an empty name.
            return null;
        } catch (IOException e) {
            System.err.println("[WARN] Failed to read .cpg file: " + e.getMessage());
            return null;
        }
    }

    /**
     * Tells whether the given text contains at least one character matched by
     * {@code CHINESE_PATTERN}.
     *
     * @param s text to inspect; may be {@code null}
     * @return {@code true} if a match is found, {@code false} otherwise or when {@code s} is null
     */
    private static boolean containsChinese(String s) {
        if (s == null) {
            return false;
        }
        return CHINESE_PATTERN.matcher(s).find();
    }

    /**
     * Heuristically decides whether a decoded string is obviously mis-decoded text.
     *
     * <p>A string is reported as garbage when it fully matches {@code OBVIOUS_GARBAGE_PATTERN},
     * when it contains at least two {@code '?'} / U+FFFD characters, or when such characters
     * make up more than 30% of its length.
     *
     * @param s decoded text to inspect; may be {@code null}
     * @return {@code true} if the text looks like garbage; {@code false} for null or empty input
     */
    private static boolean isObviousGarbage(String s) {
        if (s == null || s.isEmpty()) {
            return false;
        }
        if (OBVIOUS_GARBAGE_PATTERN.matcher(s).matches()) {
            return true;
        }

        // Count question marks and Unicode replacement characters.
        final int length = s.length();
        int suspicious = 0;
        for (int i = 0; i < length; i++) {
            char ch = s.charAt(i);
            if (ch == '?' || ch == '\ufffd') {
                suspicious++;
            }
        }

        if (suspicious >= 2) {
            return true;
        }
        // length is at least 1 here because empty input returned early.
        return (double) suspicious / length > 0.3;
    }