Java 爬取国家行政区代码数据
-
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.json.JSONArray;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URI;
import java.util.*;
import java.util.function.Consumer;
-
- /**
- * @author glgom
- * @title RegionCrawler
- * @description 国家行政区数据爬虫,数据来源于国家统计局
- */
-
- public class RegionCrawler {
- /** 行政区代码数据存在年度 */
- private static String periodUrl = "http://www.stats.gov.cn/sj/tjbz/qhdm/";
- /** 爬取数据网址 */
- private static String provinceUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/index.html";
-
- private static JSONArray dataList = new JSONArray();
-
- public static void main(String[] args) {
- // analysePeriod();
- analyseProvince("2023");
- }
-
- public static void analysePeriod(){
- String html = HttpRequest.get(periodUrl).execute().body();
- Document pDoc = Jsoup.parse(html);
- Elements lis = pDoc.select(".list-content ul li");
- List
periods = new ArrayList<>(); - for(int i=0; i
- Element e = lis.get(i);
- Elements arr = e.select("a");
- String period = arr.get(0).text().replace("年", "");
- periods.add(period);
- }
- Collections.reverse(periods); // 反转年度
- for(String period: periods){
- dataList.clear(); //清理数据,每年
- analyseProvince(period);
- }
- }
-
- /**
- * 省
- */
- public static void analyseProvince(String period){
- // String provinceUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html";
- Map
formatMap = new HashMap(); - formatMap.put("period", period);
- String url = StrUtil.format(provinceUrl, formatMap);
- String html = HttpRequest.get(url).execute().body();
- Document pDoc = Jsoup.parse(html);
- Elements arr = pDoc.select(".provincetable .provincetr td a");
- for(Element e: arr){
- String href = e.attr("href");
- String name = e.text();
- String code = href.substring(0, href.indexOf(".html"));
- JSONObject data = formatData(name, code, "中国", "1", RegionType.PROVINCE.code, RegionType.PROVINCE.name, period);
- dataList.add(data);
- System.out.println(data.toString());
- JSONObject temp = new JSONObject(data);
- temp.set("href", href);
- temp.set("url", url);
- analyseCity(temp);
- }
- System.out.println(dataList.toString());
- }
-
- /**
- * 地市级
- */
- public static void analyseCity(JSONObject p){
- // String cityUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11.html";
- // String cityUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/{href}";
- String pUrl = p.getStr("url");
- String cityUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
- String url = StrUtil.format(cityUrl, p);
- String html = HttpRequest.get(url).execute().body();
- Document pDoc = Jsoup.parse(html);
- Elements trs = pDoc.select(".citytable .citytr");
- for(Element tr: trs){
- Elements arr = tr.select(" td a");
- String name = "", code = "", href = "";
- for(int i=0; i
- Element e = arr.get(i);
- if(i==0){
- href = e.attr("href");
- code = e.text();
- }else{
- name = e.text();
- }
- }
- JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.CITY.code, RegionType.CITY.name, p.getStr("period"));
- dataList.add(data);
- System.out.println(data.toString());
- JSONObject temp = new JSONObject(data);
- temp.set("href", href);
- temp.set("url", url);
- analyseCounty(temp);
- }
- }
-
- /**
- * 县区级
- */
- public static void analyseCounty(JSONObject p){
- // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/1101.html";
- String pUrl = p.getStr("url");
- String countyUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
- String url = StrUtil.format(countyUrl, p);
- String html = HttpRequest.get(url).execute().body();
- Document pDoc = Jsoup.parse(html);
- Elements trs = pDoc.select(".countytable .countytr");
- for(Element tr: trs){
- Elements arr = tr.select("td a");
- String name = "", code = "", href = "";
- for(int i=0; i
- Element e = arr.get(i);
- if(i==0){
- href = e.attr("href");
- code = e.text();
- }else{
- name = e.text();
- }
- }
- JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.COUNTY.code, RegionType.COUNTY.name, p.getStr("period"));
- dataList.add(data);
- System.out.println(data.toString());
- JSONObject temp = new JSONObject(data);
- temp.set("href", href);
- temp.set("url", url);
- analyseTown(temp);
- }
- }
-
- /**
- * 乡镇级:区、镇
- */
- public static void analyseTown(JSONObject p){
- // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/01/110101.html";
- // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/{code}/{href}";
- String pUrl = p.getStr("url");
- String countyUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
- String url = StrUtil.format(countyUrl, p);
- String html = HttpRequest.get(url).execute().body();
- Document pDoc = Jsoup.parse(html);
- Elements trs = pDoc.select(".towntable .towntr");
- for(Element tr: trs){
- Elements arr = tr.select("td a");
- String name = "", code = "", href = "";
- for(int i=0; i
- Element e = arr.get(i);
- if(i==0){
- href = e.attr("href");
- code = e.text();
- }else{
- name = e.text();
- }
- }
- JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.TOWN.code, RegionType.TOWN.name, p.getStr("period"));
- dataList.add(data);
- System.out.println(data.toString());
- JSONObject temp = new JSONObject(data);
- temp.set("href", href);
- temp.set("url", url);
- analyseVillage(temp);
- }
- }
-
- /**
- * 村落:村落、居委会
- */
- public static void analyseVillage(JSONObject p){
- // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/01/01/110101001.html";
- // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/{href}";
- String pUrl = p.getStr("url");
- String countyUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
- String url = StrUtil.format(countyUrl, p);
- String html = HttpRequest.get(url).execute().body();
- Document pDoc = Jsoup.parse(html);
- Elements trs = pDoc.select(".villagetable .villagetr");
- for(Element tr: trs){
- Elements arr = tr.select("td");
- String name = "", code = "", href = "";
- code = arr.get(0).text();
- name = arr.get(2).text();
- JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.VILLAGE.code, RegionType.VILLAGE.name, p.getStr("period"));
- dataList.add(data);
- System.out.println(data.toString());
- }
- }
-
- public enum RegionType{
-
- PROVINCE("1", "省"),
- CITY("2", "地市级"),
- COUNTY("3", "县区级"),
- TOWN("4", "乡镇级"),
- VILLAGE("111", "村落");
-
-
- private String code;
- private String name;
-
- RegionType(String code, String name) {
- this.code = code;
- this.name = name;
- }
-
- public String getCode() {
- return code;
- }
-
- public String getName() {
- return name;
- }
-
- public static String getCode(String code){
- for(RegionType type : RegionType.values()){
- if(type.getCode().equals(code)){
- return type.getName();
- }
- }
- return "无此类型";
- }
- }
-
- public static JSONObject formatData(String name, String code, String pname, String pcode, String typecode,
- String typename, String period){
- Map
data = new HashMap<>(); - data.put("name", name);
- data.put("code", code);
- data.put("pname", pname);
- data.put("pcode", pcode);
- data.put("type_code", typecode);
- data.put("type_name", typename);
- data.put("period", period);
- JSONObject obj = JSONUtil.parseObj(data);
- return obj;
- }
- }
-
相关阅读:
TDengine 3.0 重磅发布,首届开发者大会圆满结束
常见问题: 时间戳如何转换日期时间格式?
NEON快速入门
Double.doubleToLongBits()方法使用
【前端面试题】有关html和css的前端面试27问
how install java on windows
JDBC和GUI实现图书管理系统
如何看待著名游戏引擎 Unity 宣布将更改收费模式,收取「运行时费用」?这将造成哪些影响?
Ubuntu 22.04 进入救援或单用户模式-Ubuntu 22.04忘记root密码
设计模式-桥接模式
-
原文地址:https://blog.csdn.net/glgom/article/details/133746547