字数:8904

官方资源

国家统计局设管司2017-03-10发布的数据链接在这里,根据下面程序的统计,目前共有3216个行政单位,细分如下:

  • 34个一级行政单位,即省级。指省、直辖市,包括港澳台
  • 447个二级单位,即市级。这其中包括: 113个区、县升级为市级,因区县所在市为直辖市,或县被省直辖。
  • 2735个三级单位,即区县级

写爬虫

这是一个非常简单的爬取:仅一个页面,页面元素也很简单,只需要对数据做一些关联和分级,并过滤、排除掉一些概念性的数据,比如“120100 市辖区”、“139000 省直辖县级行政区划”等。

这里有一个python写的爬虫,我在python 2.7环境下配置好bs4、uniout后跑起来过,但不是很稳定,数据的清洗和关系建立也不够完整,不完全是自己想要的。于是匆忙用java写了一个爬虫,核心代码仅如下:

/**
 * Downloads the administrative-division code page, classifies every row and
 * stores the result.
 *
 * <p>Each element with CSS class {@code MsoNormal} is read as one row made of
 * a six-digit code and a name. Rows are classified as follows:
 * <ul>
 *   <li>code ends with {@code 0000}: level 1, parent code {@code "0"};</li>
 *   <li>code ends with {@code 00}: level 2, parent is the current level-1
 *       code. If the name is one of the generic group headings, the row itself
 *       is dropped and the rows that follow are promoted one level until the
 *       next {@code 00}/{@code 0000} row;</li>
 *   <li>code ends with {@code 01} and the name is the generic "municipal
 *       districts" heading: dropped;</li>
 *   <li>anything else: level 3 under the current level-2 code, or level 2
 *       under the current level-1 code while promotion is active.</li>
 * </ul>
 *
 * <p>Rows whose code is not exactly six digits after whitespace removal are
 * skipped, so paragraphs that carry no code cannot abort the run with a
 * {@link NumberFormatException}.
 *
 * <p>When at least one row was accepted, the list is passed to
 * {@code impl.addStates} and then printed to standard output as a JSON array.
 * I/O failures are printed to standard error and otherwise ignored.
 */
public void analyse(){
	try {
		URL url = new URL("http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html");
		Document doc = Jsoup.parse(url, 10000);
		Elements items = doc.getElementsByClass("MsoNormal");
		List<State> states = new ArrayList<State>();

		String currProvinceCode = null;
		String currCityCode = null;
		String parentCode = null;
		// True while the rows being read sit below a generic heading and must
		// therefore be promoted one level.
		boolean isUp = false;

		for (int i = 0; i < items.size(); i++) {
			String code = items.get(i).select("span[lang=\"EN-US\"]").text();
			String name = items.get(i).select("span[style=\"font-family: 宋体\"]").text();
			// Remove every kind of blank, including non-breaking (U+00A0) and
			// ideographic (U+3000) spaces, so the equals() checks below see
			// the bare name.
			code = code.trim().replaceAll("[\\s\\u00A0\\u3000]+", "");
			name = name.trim().replaceAll("[\\s\\u00A0\\u3000]+", "");

			// Not a data row: nothing to classify and nothing parseInt could read.
			if (!code.matches("\\d{6}")) {
				continue;
			}

			int level;
			if (code.endsWith("0000")) {
				// Province-level row: becomes the parent of what follows.
				level = 1;
				currProvinceCode = code;
				parentCode = "0";
				isUp = false;
			} else if (code.endsWith("00")) {
				// Generic group headings are not stored; they only switch on
				// promotion for the rows listed beneath them.
				if ("市辖区".equals(name)
						|| "省直辖县级行政区划".equals(name)
						|| "自治区直辖县级行政区划".equals(name)
						|| "县".equals(name)) {
					isUp = true;
					continue;
				}
				isUp = false;
				level = 2;
				currCityCode = code;
				parentCode = currProvinceCode;
			} else if (code.endsWith("01") && "市辖区".equals(name)) {
				// Generic heading inside an ordinary city: not stored. The name
				// check keeps real units whose code happens to end in 01.
				continue;
			} else {
				if (isUp) {
					// Promoted row: hangs directly under the province-level unit.
					level = 2;
					parentCode = currProvinceCode;
				} else {
					level = 3;
					parentCode = currCityCode;
				}
			}
			states.add(new State(Integer.parseInt(code), name, level, parentCode, isUp));
		}
		if (states.size() > 0) {
			// Persist to the database.
			impl.addStates(states);

			// Build the JSON through the object API so that names are always
			// emitted as properly quoted and escaped strings.
			JSONArray ja = new JSONArray();
			for (State s : states) {
				JSONObject jo = new JSONObject();
				jo.put("code", s.getCode());
				jo.put("name", s.getName());
				jo.put("layer", s.getLevel());
				jo.put("parentCode", s.getParentCode());
				jo.put("isUP", s.isIsup());
				ja.put(jo);
			}
			System.out.println(ja.toString());
		}
	} catch (IOException e) {
		// Also covers MalformedURLException, which is a subclass of IOException.
		e.printStackTrace();
	}
}

爬取形成的json详情见此

更多的java代码

  • maven依赖:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>de.tao</groupId>
  <artifactId>state</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  <properties>
		<jdk.version>1.7</jdk.version>
		<!-- Hand jdk.version to the compiler plugin; on its own the property is
		     only a name and does not affect compilation. -->
		<maven.compiler.source>${jdk.version}</maven.compiler.source>
		<maven.compiler.target>${jdk.version}</maven.compiler.target>
		<!-- The sources contain Chinese string literals, so the encoding must
		     not depend on the build machine's default. -->
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<spring.version>4.1.6.RELEASE</spring.version>		
		<logback.version>1.1.3</logback.version>
		<jcl.slf4j.version>1.7.12</jcl.slf4j.version>		
	</properties>

	<dependencies>

		<!-- Spring JDBC -->
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-jdbc</artifactId>
			<version>${spring.version}</version>
		</dependency>
		
		<!-- MySQL JDBC driver -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.13</version>
		</dependency>

		<!-- HTML fetching and parsing -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.10.2</version>
		</dependency>
		
		<!-- logging -->
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>jcl-over-slf4j</artifactId>
			<version>${jcl.slf4j.version}</version>
		</dependency>

		<dependency>
			<groupId>ch.qos.logback</groupId>
			<artifactId>logback-classic</artifactId>
			<version>${logback.version}</version>
		</dependency>
		
		<!-- https://mvnrepository.com/artifact/org.json/json -->
		<dependency>
		    <groupId>org.json</groupId>
		    <artifactId>json</artifactId>
		    <version>20171018</version>
		</dependency>	
	</dependencies>
</project>  
  • 数据源:
package de.tao.state.db;


import javax.sql.DataSource;

import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;

import com.mysql.jdbc.jdbc2.optional.MysqlDataSource;

/**
 * Single access point for the MySQL connection settings used by the DAO layer.
 *
 * <p>The instance is created on the first call to {@link #getInstance()}; the
 * creation is not synchronized.
 */
public class DbUtil {
	/** Shared instance; {@code null} until {@link #getInstance()} is first called. */
	static DbUtil util = null;

	/** Hidden so that instances are only obtained through {@link #getInstance()}. */
	private DbUtil() {
	}

	/**
	 * Returns the shared instance, creating it on first use.
	 *
	 * @return the shared {@code DbUtil}
	 */
	public static DbUtil getInstance() {
		if (util != null) {
			return util;
		}
		util = new DbUtil();
		return util;
	}

	/**
	 * Builds a data source pointing at the configured MySQL URL.
	 *
	 * @return a new {@code MysqlDataSource} on every call
	 */
	public DataSource getDataSource() {
		MysqlDataSource source = new MysqlDataSource();
		source.setURL("jdbc:mysql://xxx.com/test?user=root&password=xxx&useUnicode=true&characterEncoding=utf8");
		return source;
	}

	/**
	 * Creates a named-parameter JDBC template backed by {@link #getDataSource()}.
	 *
	 * @return a new template on every call
	 */
	public NamedParameterJdbcTemplate getNamedParameterJdbcTemplate() {
		DataSource source = getDataSource();
		return new NamedParameterJdbcTemplate(source);
	}
}
  • 采用spring jdbc的dao
package de.tao.state.db;

import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.jdbc.core.namedparam.MapSqlParameterSource;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import org.springframework.jdbc.core.namedparam.SqlParameterSource;

import de.tao.state.bean.State;

/**
 * Writes {@link State} rows into the {@code state} table through Spring's
 * named-parameter JDBC template.
 */
public class DaoImpl {
	// Not used by any method in this class at the moment.
	private final Logger logger = LoggerFactory.getLogger(DaoImpl.class);
	// One template shared by all instances, obtained when the class is loaded.
	static NamedParameterJdbcTemplate template = DbUtil.getInstance().getNamedParameterJdbcTemplate();

	/**
	 * Inserts all given states with a single batch statement.
	 *
	 * @param states the rows to insert, in insertion order
	 */
	public void addStates(List<State> states) {
		SqlParameterSource[] batch = new SqlParameterSource[states.size()];
		int slot = 0;
		for (State state : states) {
			batch[slot] = getSqlParameterByModel(state);
			slot++;
		}
		String sql = "insert into state(code, name, level, parentcode, isup) values(:code, :name, :level, :parentCode, :isup)";
		template.batchUpdate(sql, batch);
	}

	/**
	 * Maps one state onto the named parameters used by the insert statement.
	 *
	 * @param state the row to map
	 * @return parameter source holding name, code, level, parentCode and isup
	 */
	private SqlParameterSource getSqlParameterByModel(State state) {
		return new MapSqlParameterSource()
				.addValue("name", state.getName())
				.addValue("code", state.getCode())
				.addValue("level", state.getLevel())
				.addValue("parentCode", state.getParentCode())
				.addValue("isup", state.isIsup());
	}

}
  • bean:
package de.tao.state.bean;

/**
 * One administrative unit: its numeric code, display name, level in the
 * hierarchy, the code of its parent and a flag telling whether it was
 * promoted one level.
 */
public class State {
	String name;
	int code;
	int level;
	int parentCode;
	boolean isup;

	/**
	 * Creates a fully populated state.
	 *
	 * @param code       numeric code of this unit
	 * @param name       display name
	 * @param level      hierarchy level
	 * @param parentCode parent code as decimal text; parsed with
	 *                   {@link Integer#parseInt(String)}
	 * @param isup       whether the unit was promoted one level
	 * @throws NumberFormatException if {@code parentCode} is {@code null} or
	 *                               not a decimal integer
	 */
	public State(int code, String name, int level, String parentCode, boolean isup) {
		int parsedParent = Integer.parseInt(parentCode);
		this.name = name;
		this.code = code;
		this.level = level;
		this.parentCode = parsedParent;
		this.isup = isup;
	}

	/** @return the numeric code of this unit */
	public int getCode() {
		return this.code;
	}

	/** @param code the numeric code of this unit */
	public void setCode(int code) {
		this.code = code;
	}

	/** @return the display name */
	public String getName() {
		return this.name;
	}

	/** @param name the display name */
	public void setName(String name) {
		this.name = name;
	}

	/** @return the hierarchy level */
	public int getLevel() {
		return this.level;
	}

	/** @param level the hierarchy level */
	public void setLevel(int level) {
		this.level = level;
	}

	/** @return the numeric code of the parent unit */
	public int getParentCode() {
		return this.parentCode;
	}

	/** @param parentCode the numeric code of the parent unit */
	public void setParentCode(int parentCode) {
		this.parentCode = parentCode;
	}

	/** @return whether the unit was promoted one level */
	public boolean isIsup() {
		return this.isup;
	}

	/** @param isup whether the unit was promoted one level */
	public void setIsup(boolean isup) {
		this.isup = isup;
	}
}
  • logback配置:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Console-only logging: debug for the application package, errors only for
     Spring and for everything else. -->
<configuration>

	<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
		<!-- An encoder replaces the legacy <layout> child, which logback only
		     accepts by wrapping it and logging a warning at startup. -->
		<encoder>
			<pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern>
		</encoder>
	</appender>

	<!-- additivity="false" stops events from also reaching the root logger,
	     which would print them a second time on the same appender. -->
	<logger name="org.springframework" level="error" additivity="false">
		<appender-ref ref="STDOUT" />
	</logger>
	
	<logger name="de.tao.state" level="debug" additivity="false">
		<appender-ref ref="STDOUT" />
	</logger>
	
	<root level="error">
		<appender-ref ref="STDOUT" />
	</root>

</configuration>