• 国家行政区代码


    Java 爬取国家行政区代码数据

    1. import cn.hutool.core.util.StrUtil;
    2. import cn.hutool.http.HttpRequest;
    3. import cn.hutool.json.JSONArray;
    4. import cn.hutool.json.JSONObject;
    5. import cn.hutool.json.JSONUtil;
    6. import org.jsoup.Jsoup;
    7. import org.jsoup.nodes.Document;
    8. import org.jsoup.nodes.Element;
    9. import org.jsoup.select.Elements;
    10. import java.util.*;
    11. /**
    12. * @author glgom
    13. * @title RegionCrawler
    14. * @description 国家行政区数据爬虫,数据来源于国家统计局
    15. */
    16. public class RegionCrawler {
    17. /** 行政区代码数据存在年度 */
    18. private static String periodUrl = "http://www.stats.gov.cn/sj/tjbz/qhdm/";
    19. /** 爬取数据网址 */
    20. private static String provinceUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/index.html";
    21. private static JSONArray dataList = new JSONArray();
    22. public static void main(String[] args) {
    23. // analysePeriod();
    24. analyseProvince("2023");
    25. }
    26. public static void analysePeriod(){
    27. String html = HttpRequest.get(periodUrl).execute().body();
    28. Document pDoc = Jsoup.parse(html);
    29. Elements lis = pDoc.select(".list-content ul li");
    30. List periods = new ArrayList<>();
    31. for(int i=0; i
    32. Element e = lis.get(i);
    33. Elements arr = e.select("a");
    34. String period = arr.get(0).text().replace("年", "");
    35. periods.add(period);
    36. }
    37. Collections.reverse(periods); // 反转年度
    38. for(String period: periods){
    39. dataList.clear(); //清理数据,每年
    40. analyseProvince(period);
    41. }
    42. }
    43. /**
    44. * 省
    45. */
    46. public static void analyseProvince(String period){
    47. // String provinceUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html";
    48. Map formatMap = new HashMap();
    49. formatMap.put("period", period);
    50. String url = StrUtil.format(provinceUrl, formatMap);
    51. String html = HttpRequest.get(url).execute().body();
    52. Document pDoc = Jsoup.parse(html);
    53. Elements arr = pDoc.select(".provincetable .provincetr td a");
    54. for(Element e: arr){
    55. String href = e.attr("href");
    56. String name = e.text();
    57. String code = href.substring(0, href.indexOf(".html"));
    58. JSONObject data = formatData(name, code, "中国", "1", RegionType.PROVINCE.code, RegionType.PROVINCE.name, period);
    59. dataList.add(data);
    60. System.out.println(data.toString());
    61. JSONObject temp = new JSONObject(data);
    62. temp.set("href", href);
    63. temp.set("url", url);
    64. analyseCity(temp);
    65. }
    66. System.out.println(dataList.toString());
    67. }
    68. /**
    69. * 地市级
    70. */
    71. public static void analyseCity(JSONObject p){
    72. // String cityUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11.html";
    73. // String cityUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/{href}";
    74. String pUrl = p.getStr("url");
    75. String cityUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
    76. String url = StrUtil.format(cityUrl, p);
    77. String html = HttpRequest.get(url).execute().body();
    78. Document pDoc = Jsoup.parse(html);
    79. Elements trs = pDoc.select(".citytable .citytr");
    80. for(Element tr: trs){
    81. Elements arr = tr.select(" td a");
    82. String name = "", code = "", href = "";
    83. for(int i=0; i
    84. Element e = arr.get(i);
    85. if(i==0){
    86. href = e.attr("href");
    87. code = e.text();
    88. }else{
    89. name = e.text();
    90. }
    91. }
    92. JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.CITY.code, RegionType.CITY.name, p.getStr("period"));
    93. dataList.add(data);
    94. System.out.println(data.toString());
    95. JSONObject temp = new JSONObject(data);
    96. temp.set("href", href);
    97. temp.set("url", url);
    98. analyseCounty(temp);
    99. }
    100. }
    101. /**
    102. * 县区级
    103. */
    104. public static void analyseCounty(JSONObject p){
    105. // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/1101.html";
    106. String pUrl = p.getStr("url");
    107. String countyUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
    108. String url = StrUtil.format(countyUrl, p);
    109. String html = HttpRequest.get(url).execute().body();
    110. Document pDoc = Jsoup.parse(html);
    111. Elements trs = pDoc.select(".countytable .countytr");
    112. for(Element tr: trs){
    113. Elements arr = tr.select("td a");
    114. String name = "", code = "", href = "";
    115. for(int i=0; i
    116. Element e = arr.get(i);
    117. if(i==0){
    118. href = e.attr("href");
    119. code = e.text();
    120. }else{
    121. name = e.text();
    122. }
    123. }
    124. JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.COUNTY.code, RegionType.COUNTY.name, p.getStr("period"));
    125. dataList.add(data);
    126. System.out.println(data.toString());
    127. JSONObject temp = new JSONObject(data);
    128. temp.set("href", href);
    129. temp.set("url", url);
    130. analyseTown(temp);
    131. }
    132. }
    133. /**
    134. * 乡镇级:区、镇
    135. */
    136. public static void analyseTown(JSONObject p){
    137. // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/01/110101.html";
    138. // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/{code}/{href}";
    139. String pUrl = p.getStr("url");
    140. String countyUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
    141. String url = StrUtil.format(countyUrl, p);
    142. String html = HttpRequest.get(url).execute().body();
    143. Document pDoc = Jsoup.parse(html);
    144. Elements trs = pDoc.select(".towntable .towntr");
    145. for(Element tr: trs){
    146. Elements arr = tr.select("td a");
    147. String name = "", code = "", href = "";
    148. for(int i=0; i
    149. Element e = arr.get(i);
    150. if(i==0){
    151. href = e.attr("href");
    152. code = e.text();
    153. }else{
    154. name = e.text();
    155. }
    156. }
    157. JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.TOWN.code, RegionType.TOWN.name, p.getStr("period"));
    158. dataList.add(data);
    159. System.out.println(data.toString());
    160. JSONObject temp = new JSONObject(data);
    161. temp.set("href", href);
    162. temp.set("url", url);
    163. analyseVillage(temp);
    164. }
    165. }
    166. /**
    167. * 村落:村落、居委会
    168. */
    169. public static void analyseVillage(JSONObject p){
    170. // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11/01/01/110101001.html";
    171. // String countyUrl = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/{period}/{href}";
    172. String pUrl = p.getStr("url");
    173. String countyUrl = pUrl.substring(0, pUrl.lastIndexOf("/"))+"/{href}";
    174. String url = StrUtil.format(countyUrl, p);
    175. String html = HttpRequest.get(url).execute().body();
    176. Document pDoc = Jsoup.parse(html);
    177. Elements trs = pDoc.select(".villagetable .villagetr");
    178. for(Element tr: trs){
    179. Elements arr = tr.select("td");
    180. String name = "", code = "", href = "";
    181. code = arr.get(0).text();
    182. name = arr.get(2).text();
    183. JSONObject data = formatData(name, code, p.getStr("name"),p.getStr("code"), RegionType.VILLAGE.code, RegionType.VILLAGE.name, p.getStr("period"));
    184. dataList.add(data);
    185. System.out.println(data.toString());
    186. }
    187. }
    188. public enum RegionType{
    189. PROVINCE("1", "省"),
    190. CITY("2", "地市级"),
    191. COUNTY("3", "县区级"),
    192. TOWN("4", "乡镇级"),
    193. VILLAGE("111", "村落");
    194. private String code;
    195. private String name;
    196. RegionType(String code, String name) {
    197. this.code = code;
    198. this.name = name;
    199. }
    200. public String getCode() {
    201. return code;
    202. }
    203. public String getName() {
    204. return name;
    205. }
    206. public static String getCode(String code){
    207. for(RegionType type : RegionType.values()){
    208. if(type.getCode().equals(code)){
    209. return type.getName();
    210. }
    211. }
    212. return "无此类型";
    213. }
    214. }
    215. public static JSONObject formatData(String name, String code, String pname, String pcode, String typecode,
    216. String typename, String period){
    217. Map data = new HashMap<>();
    218. data.put("name", name);
    219. data.put("code", code);
    220. data.put("pname", pname);
    221. data.put("pcode", pcode);
    222. data.put("type_code", typecode);
    223. data.put("type_name", typename);
    224. data.put("period", period);
    225. JSONObject obj = JSONUtil.parseObj(data);
    226. return obj;
    227. }
    228. }

  • 相关阅读:
    TDengine 3.0 重磅发布,首届开发者大会圆满结束
    常见问题: 时间戳如何转换日期时间格式?
    NEON快速入门
    Double.doubleToLongBits()方法使用
    【前端面试题】有关html和css的前端面试27问
    how install java on windows
    JDBC和GUI实现图书管理系统
    如何看待著名游戏引擎 Unity 宣布将更改收费模式,收取「运行时费用」?这将造成哪些影响?
    Ubuntu 22.04 进入救援或单用户模式-Ubuntu 22.04忘记root密码
    设计模式-桥接模式
  • 原文地址:https://blog.csdn.net/glgom/article/details/133746547