字数:8904
官方资源
国家统计局设管司于2017-03-10发布的数据链接在这里。根据下面程序的统计,目前共有3216个行政单位,细分如下:
- 34个一级行政单位,即省级。指省、直辖市,包括港澳台
- 447个二级单位,即市级。其中包括113个被提升为市级的区、县:它们或位于直辖市内,或是由省直辖的县。
- 2735个三级单位,即区县级
写爬虫
这是一个非常简单的爬取:仅一个页面,页面元素也很简单,只需要对数据做一些关联和分级,并过滤、排除掉一些概念性的数据,比如“120100 市辖区”、“139000 省直辖县级行政区划”等。
这里有一个python写的爬虫,我在python 2.7环境下配置好bs4、uniout后跑起来过,但不是很稳定,数据的清洗和关系建立也不够完整,不是自己想要的。于是匆忙用java写了一个爬虫,核心代码如下:
/**
 * Scrapes the administrative division code page, assigns every entry a level
 * and a parent code, stores the result through {@code impl.addStates} and
 * prints the same data as a JSON array on standard output.
 *
 * <p>Classification rules, applied to each six-digit code:
 * <ul>
 *   <li>codes ending in {@code 0000} are level 1 and get parent code {@code "0"};</li>
 *   <li>codes ending in {@code 00} are level 2 under the current level-1 entry,
 *       unless the name is a grouping label, in which case the row is dropped
 *       and the rows that follow are promoted to level 2;</li>
 *   <li>a code ending in {@code 01} named "市辖区" is a grouping label and is dropped;</li>
 *   <li>everything else is level 3 under the current level-2 entry, or level 2
 *       under the current level-1 entry while promotion is active.</li>
 * </ul>
 *
 * <p>Rows that do not carry a six-digit numeric code are skipped. I/O failures
 * are reported with {@code printStackTrace()} and the method returns normally.
 */
public void analyse() {
    try {
        URL url = new URL("http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html");
        Document doc = Jsoup.parse(url, 10000);
        Elements items = doc.getElementsByClass("MsoNormal");
        List<State> states = new ArrayList<State>();
        String currProvinceCode = null;
        String currCityCode = null;
        // True while the rows being read sit under a grouping label and must be promoted one level.
        boolean isUp = false;
        for (int i = 0; i < items.size(); i++) {
            String code = stripWhitespace(items.get(i).select("span[lang=\"EN-US\"]").text());
            String name = stripWhitespace(items.get(i).select("span[style=\"font-family: 宋体\"]").text());
            // Skip rows without a proper code; Integer.parseInt would otherwise throw on them.
            if (!code.matches("\\d{6}")) {
                continue;
            }
            int level;
            String parentCode;
            if (code.endsWith("0000")) {
                // Province-level entry: starts a new scope and cancels any promotion.
                level = 1;
                currProvinceCode = code;
                parentCode = "0";
                isUp = false;
            } else if (code.endsWith("00")) {
                if (isGroupingLabel(name)) {
                    // Not a real unit: drop it and promote the rows listed under it.
                    isUp = true;
                    continue;
                }
                isUp = false;
                level = 2;
                currCityCode = code;
                parentCode = currProvinceCode;
            } else if (code.endsWith("01") && "市辖区".equals(name)) {
                // Grouping label inside a city; never stored and does not affect promotion.
                continue;
            } else if (isUp) {
                level = 2;
                parentCode = currProvinceCode;
            } else {
                level = 3;
                parentCode = currCityCode;
            }
            states.add(new State(Integer.parseInt(code), name, level, parentCode, isUp));
        }
        if (states.size() > 0) {
            // Persist to the database.
            impl.addStates(states);
            // Build the JSON with typed puts so that names are always emitted as
            // properly quoted strings, whatever characters they contain.
            JSONArray ja = new JSONArray();
            for (State s : states) {
                JSONObject jo = new JSONObject();
                jo.put("code", s.getCode());
                jo.put("name", s.getName());
                jo.put("layer", s.getLevel());
                jo.put("parentCode", s.getParentCode());
                jo.put("isUP", s.isIsup());
                ja.put(jo);
            }
            System.out.println(ja.toString());
        }
    } catch (IOException e) {
        // MalformedURLException is an IOException, so this single handler covers both.
        e.printStackTrace();
    }
}

/** Removes every whitespace character, including the no-break space (U+00A0) and the ideographic space (U+3000). */
private static String stripWhitespace(String raw) {
    return raw.replaceAll("[\\s\\u00A0\\u3000]", "");
}

/** Tells whether a name is one of the grouping labels that do not denote a real administrative unit. */
private static boolean isGroupingLabel(String name) {
    return "市辖区".equals(name)
            || "省直辖县级行政区划".equals(name)
            || "自治区直辖县级行政区划".equals(name)
            || "县".equals(name);
}
爬取形成的json详情见此。
更多的java代码
- maven依赖:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>de.tao</groupId>
    <artifactId>state</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <properties>
        <jdk.version>1.7</jdk.version>
        <!-- jdk.version alone is not read by any plugin; these two properties hand it to the compiler plugin. -->
        <maven.compiler.source>${jdk.version}</maven.compiler.source>
        <maven.compiler.target>${jdk.version}</maven.compiler.target>
        <!-- The Java sources contain Chinese string literals, so the source encoding must not depend on the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <spring.version>4.1.6.RELEASE</spring.version>
        <logback.version>1.1.3</logback.version>
        <jcl.slf4j.version>1.7.12</jcl.slf4j.version>
    </properties>
    <dependencies>
        <!-- Spring JDBC -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.13</version>
        </dependency>
        <!-- HTML parsing for the scraper -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <!-- Logging: route commons-logging calls to SLF4J, with logback as the backend -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>${jcl.slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.json/json -->
        <dependency>
            <groupId>org.json</groupId>
            <artifactId>json</artifactId>
            <version>20171018</version>
        </dependency>
    </dependencies>
</project>
- 数据源:
package de.tao.state.db;
import javax.sql.DataSource;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import com.mysql.jdbc.jdbc2.optional.MysqlDataSource;
/**
 * Access point for the MySQL data source and the Spring named-parameter JDBC
 * template built on top of it. A single instance is created on the first call
 * to {@link #getInstance()}; the creation is not synchronized.
 */
public class DbUtil {
    /** The shared instance; {@code null} until {@link #getInstance()} has been called. */
    static DbUtil util;

    private DbUtil() {
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared {@code DbUtil}
     */
    public static DbUtil getInstance() {
        if (util != null) {
            return util;
        }
        util = new DbUtil();
        return util;
    }

    /**
     * Builds a new MySQL data source on every call, configured through the JDBC URL.
     *
     * @return a freshly created data source
     */
    public DataSource getDataSource() {
        String jdbcUrl = "jdbc:mysql://xxx.com/test?user=root&password=xxx&useUnicode=true&characterEncoding=utf8";
        MysqlDataSource source = new MysqlDataSource();
        source.setURL(jdbcUrl);
        return source;
    }

    /**
     * Creates a new named-parameter JDBC template backed by {@link #getDataSource()}.
     *
     * @return a freshly created template
     */
    public NamedParameterJdbcTemplate getNamedParameterJdbcTemplate() {
        DataSource source = getDataSource();
        return new NamedParameterJdbcTemplate(source);
    }
}
- 采用spring jdbc的dao
package de.tao.state.db;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import org.springframework.jdbc.core.namedparam.SqlParameterSource;
import de.tao.state.bean.State;
/**
 * Data access object that writes {@link State} rows into the {@code state}
 * table through a shared named-parameter JDBC template.
 */
public class DaoImpl {
    private final Logger logger = LoggerFactory.getLogger(DaoImpl.class);
    /** Template obtained once from {@link DbUtil} when this class is initialised. */
    static NamedParameterJdbcTemplate template = DbUtil.getInstance().getNamedParameterJdbcTemplate();

    /**
     * Inserts all given states with one batch update.
     *
     * @param states the rows to insert, in list order
     */
    public void addStates(List<State> states) {
        final String sql = "insert into state(code, name, level, parentcode, isup) values(:code, :name, :level, :parentCode, :isup)";
        SqlParameterSource[] batch = new SqlParameterSource[states.size()];
        int slot = 0;
        for (State state : states) {
            batch[slot++] = toParameterSource(state);
        }
        template.batchUpdate(sql, batch);
    }

    /** Maps one state onto the named parameters used by the insert statement. */
    private SqlParameterSource toParameterSource(State state) {
        return new MapSqlParameterSource()
                .addValue("name", state.getName())
                .addValue("code", state.getCode())
                .addValue("level", state.getLevel())
                .addValue("parentCode", state.getParentCode())
                .addValue("isup", state.isIsup());
    }
}
- bean:
package de.tao.state.bean;
/**
 * One administrative division entry: its numeric code, display name, level,
 * the code of its parent and whether it was promoted one level.
 */
public class State {
    String name;
    int code;
    int level;
    int parentCode;
    boolean isup;

    /**
     * Creates an entry.
     *
     * @param code       numeric code of this entry
     * @param name       display name
     * @param level      level of this entry
     * @param parentCode parent code as decimal text; it is parsed to an int
     * @param isup       whether the entry was promoted one level
     * @throws NumberFormatException if {@code parentCode} is not a parsable integer
     */
    public State(int code, String name, int level, String parentCode, boolean isup) {
        this.name = name;
        this.code = code;
        this.level = level;
        this.isup = isup;
        this.parentCode = Integer.parseInt(parentCode);
    }

    /** @return the display name */
    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** @return the numeric code */
    public int getCode() {
        return this.code;
    }

    public void setCode(int code) {
        this.code = code;
    }

    /** @return the level */
    public int getLevel() {
        return this.level;
    }

    public void setLevel(int level) {
        this.level = level;
    }

    /** @return the parent's numeric code */
    public int getParentCode() {
        return this.parentCode;
    }

    public void setParentCode(int parentCode) {
        this.parentCode = parentCode;
    }

    /** @return whether the entry was promoted one level */
    public boolean isIsup() {
        return this.isup;
    }

    public void setIsup(boolean isup) {
        this.isup = isup;
    }
}
- logback配置:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Logback configuration: everything goes to the console.
  Spring logs at error level, the application package de.tao.state at debug
  level, and every other logger falls back to the root level (error).
-->
<configuration>
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <!-- An encoder replaces the deprecated layout nested directly in the appender; the pattern is unchanged. -->
        <encoder>
            <pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>
    <!-- additivity="false" keeps these events from also reaching the root logger, which would print them twice. -->
    <logger name="org.springframework" level="error" additivity="false">
        <appender-ref ref="STDOUT" />
    </logger>
    <logger name="de.tao.state" level="debug" additivity="false">
        <appender-ref ref="STDOUT" />
    </logger>
    <root level="error">
        <appender-ref ref="STDOUT" />
    </root>
</configuration>