import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.nio.charset.StandardCharsets;
public class hwztest {
public static void main(String[] args) throws IOException {
//6.Jsoup解析html
String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/41/";
Document document ;
FileOutputStream fos=new FileOutputStream("henanstreet.txt");
for(int i=90;i<91;i++){
document = Jsoup.connect(url+"41"+i+".html").get();
// System.out.println(document.getElementsByClass("towntr"));
Elements eles = document.getElementsByClass("countytr");
// Document containerDoc = Jsoup.parse(eles.toString());
// System.out.println(containerDoc);
for(Element countytr :eles){
Element street = countytr.getElementsByAttribute("href").first();
Elements countytds = countytr.getElementsByTag("td");
String areaCode = countytds.get(0).text().substring(0,6);
String streeturl = countytr.getElementsByTag("td").first().getElementsByTag("a").attr("href");
System.out.println(streeturl);
if(StringUtils.isNotBlank(streeturl)){
Document document1 = Jsoup.connect(url+"/"+streeturl).get();
Elements elestreets = document1.getElementsByClass("towntr");
for(Element towntr :elestreets){
Elements tds = towntr.getElementsByTag("td");
//area的街道插入sql
fos.write(("INSERT INTO `business_user_group`(`user_group_id`, `parent_id`, `user_group_name`, `user_group_level`) VALUES ('"+tds.get(0).text()+"', '"+areaCode+"', '"+tds.get(1).text()+"', 5);\n").getBytes(StandardCharsets.UTF_8));
// fos.write(("DELETE FROM `business_user_group` WHERE `user_group_id` = '"+tds.get(0).text()+"';\n").getBytes(StandardCharsets.UTF_8)); }
}
}
}
}
}
读取国家统计局网站上河南省下辖的街道信息,生成对应的 MySQL 插入语句,以便一次性批量插入数据库。
注:读取完成之后,请检查一遍生成的 SQL 语句;有些生僻字可能会出现字符编码乱码的问题,需要自己手动重新编辑一下。