Java 简单爬取男女生姓名

255 阅读1分钟
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  
/**
 * Small demo crawler: downloads three web pages, extracts surnames and given names from them
 * with regular expressions, combines them into random full names and writes the result to a
 * local file, one name per line.
 */
public class Test14 {

    /** Size of the character buffer used while downloading a page. */
    private static final int READ_BUFFER_SIZE = 8192;

    /** Maximum time to wait for the TCP connection to be established. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;

    /** Maximum time to wait for data once connected, so a stalled server cannot hang the run. */
    private static final int READ_TIMEOUT_MILLIS = 30_000;

    /**
     * Crawls the three source pages, prints what was extracted, builds 30 girl names and writes
     * them to the output file.
     *
     * @param args unused
     * @throws IOException if a page cannot be downloaded or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {

        // Page that lists the surnames.
        String surnameNet = "https://hanyu.baidu.com/shici/detail?pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&from=kg0";
        // Page that lists boy given names.
        String boyNet = "http://www.haoming8.cn/baobao/10881.html";
        // Page that lists girl given names.
        String girlNet = "http://www.haoming8.cn/baobao/7641.html";

        // Surnames: group 1 is a run of four characters followed by a comma or full stop.
        String surnameContent = getUrlContent(surnameNet);
        ArrayList<String> surnameList = regularCompilation(surnameContent, "(.{4})(,|。)", 1, "姓氏");

        System.out.println();

        // Boy names: group 1 is a two-character name followed by the enumeration comma.
        String boyContent = getUrlContent(boyNet);
        ArrayList<String> boyNameList = regularCompilation(boyContent, "([\\u4e00-\\u9fa5·]{2})(、)", 1, "男生名字");

        System.out.println();

        // Girl names: the whole match is five two-character names separated by single spaces.
        String girlContent = getUrlContent(girlNet);
        ArrayList<String> girlNameList = regularCompilation(girlContent, "(.. ){4}..", 0, "女生名字");

        System.out.println();

        // Combine surnames and given names; the returned set is what gets written to disk.
        LinkedHashSet<String> fullNames = combinationName(surnameList, boyNameList, girlNameList, 30, false);

        // Write the names to a local file. UTF-8 is used explicitly so the Chinese text does not
        // depend on the platform default charset; try-with-resources closes the file on failure too.
        try (BufferedWriter writer = Files.newBufferedWriter(
                Paths.get("E:\\Code\\JavaDemo\\src\\CrawlData"), StandardCharsets.UTF_8)) {
            for (String fullName : fullNames) {
                writer.write(fullName);
                writer.newLine();
            }
        }
    }

    /**
     * Downloads the content of a URL as text.
     *
     * <p>The text is decoded with the charset announced in the response's {@code Content-Type}
     * header, falling back to UTF-8 when none is announced or it is not supported. The original
     * note on this method says the site must be one that opens normally in a browser, otherwise
     * it cannot be crawled.
     *
     * @param net the URL to download
     * @return the full response body as a string
     * @throws IOException if the URL is malformed, the connection fails or reading times out
     */
    public static String getUrlContent(String net) throws IOException {
        URLConnection urlConnection = new URL(net).openConnection();
        urlConnection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        urlConnection.setReadTimeout(READ_TIMEOUT_MILLIS);

        Charset charset = resolveCharset(urlConnection.getContentType());
        StringBuilder content = new StringBuilder();

        // try-with-resources closes the reader (and the underlying stream) even if read() throws.
        try (InputStreamReader reader = new InputStreamReader(urlConnection.getInputStream(), charset)) {
            char[] buffer = new char[READ_BUFFER_SIZE];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
        }
        return content.toString();
    }

    /**
     * Extracts the charset from a {@code Content-Type} header value such as
     * {@code text/html; charset=GBK}.
     *
     * @param contentType the header value, may be {@code null}
     * @return the announced charset, or UTF-8 if it is absent, malformed or unsupported
     */
    private static Charset resolveCharset(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String parameter = part.trim();
                if (!parameter.regionMatches(true, 0, key, 0, key.length())) {
                    continue;
                }
                String charsetName = parameter.substring(key.length()).trim();
                // The value may be quoted: charset="utf-8".
                if (charsetName.length() >= 2 && charsetName.startsWith("\"") && charsetName.endsWith("\"")) {
                    charsetName = charsetName.substring(1, charsetName.length() - 1);
                }
                try {
                    return Charset.forName(charsetName);
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall back to the default below.
                    break;
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Collects one capture group from every match of a regular expression and prints the result.
     *
     * @param str     the text to search (the downloaded page content)
     * @param regex   the regular expression to apply
     * @param index   the group to collect from each match; for {@code (.{4})(,|。)} index 0 is the
     *                whole match, 1 is {@code (.{4})} and 2 is {@code (,|。)}
     * @param logTest label used in the printed header line
     * @return the collected group values in match order
     */
    private static ArrayList<String> regularCompilation(String str, String regex, int index, String logTest) {
        ArrayList<String> matches = new ArrayList<>();

        Matcher matcher = Pattern.compile(regex).matcher(str);
        while (matcher.find()) {
            matches.add(matcher.group(index));
        }

        System.out.println("该" + logTest + "为:");
        System.out.println(matches);

        return matches;
    }

    /**
     * Builds random, distinct full names by prefixing a single-character surname to a given name.
     *
     * <p>The input lists are not modified. If fewer than {@code nameQuantity} distinct names can
     * be formed from the inputs, every possible name is returned instead of looping forever; if
     * either the surnames or the selected given names are empty the result is empty.
     *
     * @param surnameList  surname strings, each holding several single-character surnames
     *                     back to back, e.g. a four-character run
     * @param boyNameList  boy given names, one per element; only used when {@code isBoy} is true
     * @param girlNameList girl given names, several per element separated by single spaces; only
     *                     used when {@code isBoy} is false
     * @param nameQuantity how many names to generate
     * @param isBoy        {@code true} to combine with boy names, {@code false} for girl names
     * @return the generated names, de-duplicated, in the order they were generated
     */
    private static LinkedHashSet<String> combinationName(ArrayList<String> surnameList, ArrayList<String> boyNameList, ArrayList<String> girlNameList, int nameQuantity, boolean isBoy) {

        // LinkedHashSet removes duplicates and keeps insertion order (it does not sort).
        LinkedHashSet<String> fullNames = new LinkedHashSet<>();

        List<String> surnames = splitIntoSingleSurnames(surnameList);
        List<String> givenNames = isBoy ? boyNameList : splitGirlNames(girlNameList);

        // A surname is always exactly one char, so distinct (surname, given name) pairs always
        // yield distinct full names; this product is therefore the exact number of reachable names.
        long reachable = (long) new HashSet<String>(surnames).size() * new HashSet<String>(givenNames).size();
        long target = Math.min((long) nameQuantity, reachable);

        // Picking a random index is equivalent to "shuffle, then take element 0" but does not
        // reorder the caller's lists and costs O(1) per pick.
        Random random = new Random();
        while (fullNames.size() < target) {
            String surname = surnames.get(random.nextInt(surnames.size()));
            String givenName = givenNames.get(random.nextInt(givenNames.size()));
            fullNames.add(surname + givenName);
        }

        System.out.println(fullNames);

        return fullNames;
    }

    /** Splits every surname string into its individual characters, one surname per character. */
    private static List<String> splitIntoSingleSurnames(List<String> surnameList) {
        List<String> singleSurnames = new ArrayList<>();
        for (String group : surnameList) {
            for (int i = 0; i < group.length(); i++) {
                singleSurnames.add(String.valueOf(group.charAt(i)));
            }
        }
        return singleSurnames;
    }

    /** Splits every space-separated group of girl names into single names, dropping empty pieces. */
    private static List<String> splitGirlNames(List<String> girlNameList) {
        List<String> singleNames = new ArrayList<>();
        for (String group : girlNameList) {
            for (String name : group.split(" ")) {
                if (!name.isEmpty()) {
                    singleNames.add(name);
                }
            }
        }
        return singleNames;
    }
}