`
mywebcode
  • 浏览: 1004882 次
文章分类
社区版块
存档分类
最新评论

ip抓取(二)

 
阅读更多

2、ip抓取第二阶段需求:

根据第一阶段抓取到的数据,向另一个IP查询网站(数据查询中心 http://199604.com/ip/)发送请求,获取返回的数据,并抓取其中与ip相关的属性数据并保存。

3、程序文档分析:

3.1 根据已有的数据,向ip查询网站 http://199604.com/ip/ 发送请求,发送请求的方法为requestPost();

并将返回的html文本存放在IpHTML.txt;(利用分页查询SQL语句)每次取出100条,然后对这100条记录循环发送请求

3.2对接收的文本进行解析、过滤

第一次过滤:过滤完毕

第二次过滤:过滤完毕存进IpOperator1.txt

3.3对已过滤过的IP数据进行处理

3.3.1执行过程中遇到抓取数据不完整的IP,将它写入BugIp.txt文本

3.3.2合法的Ip段,存入ipdata数据表

3.3.3有问题Ip(即开始ip查询的数据与结束IP的数据不一致)存放到ipspecial数据表。

4、程序性能描述:

第一阶段抓到的数据:共3594条

合法ip:3143条

有问题Ip:450条

丢失数据:1条

全程跑完历时:80分钟。期间抛出一次异常。

5、第二阶段工作已完毕(耗时2天)

IpDemo2.java

package com.htjf.ip2;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.helper.StringUtil;

import com.mysql.jdbc.StringUtils;
import com.htjf.ip.IpModel;

/**
 * Second stage of the IP crawler.
 * 
 * <p>
 * Reads the start/end IP pairs collected by the first stage (through
 * {@code SqlData}), posts each address to the lookup page, extracts the
 * location fields from the returned HTML and stores every range either in the
 * regular table or, when the start and end address resolve to different
 * cities, in the "special" table.
 * 
 * <p>
 * The intermediate results are written to fixed files on drive {@code E:}, so
 * the class is not safe for concurrent use.
 * 
 * @author Administrator
 */
public class IpDemo2 {

	/** Lookup page that receives the POST request. */
	private static final String LOOKUP_URL = "http://199604.com/ip/";

	/** Tag-stripped copy of the last response. */
	private static final String HTML_FILE = "E://IpHTML.txt";

	/** Holds the single data row extracted from {@link #HTML_FILE}. */
	private static final String OPERATOR_FILE = "E://IpOperator1.txt";

	/** Append-only list of addresses whose lookup was incomplete or failed. */
	private static final String BUG_IP_FILE = "E://BugIp.txt";

	/** Number of rows fetched from the database per page. */
	private static final int PAGE_SIZE = 100;

	/**
	 * Offset used when no start offset is given on the command line. This is
	 * the resume point that was hard-coded in the original program; pass
	 * {@code 0} as the first argument to process the whole table.
	 */
	private static final int DEFAULT_START_OFFSET = 3339;

	/** Matches any HTML tag; used to strip markup from the response. */
	private static final String TAG_REGEX = "<(.[^>]*)>";

	/** Marker that {@link #requestPost(String)} puts in place of {@code <tr>}. */
	private static final Pattern ROW_MARKER = Pattern.compile("ipOperator:");

	/** Any run of whitespace. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	private static final Random RANDOM = new Random();

	/**
	 * Program entry point: pages through the first-stage table and processes
	 * every start/end pair.
	 * 
	 * @param args
	 *            optional; {@code args[0]} is the row offset to start from
	 *            (defaults to {@link #DEFAULT_START_OFFSET})
	 */
	public static void main(String args[]) {
		System.out.println("程序入口");
		SqlData sqlData = new SqlData();

		int num = sqlData.findIpCount();
		int k = parseStartOffset(args);
		while (k < num) {
			List<IpModel> iplist = sqlData.findIp(k, PAGE_SIZE);
			if (iplist.isEmpty()) {
				// An empty page (e.g. after a query error) would never advance
				// k, so stop instead of spinning forever.
				System.err.println("No rows returned at offset " + k
						+ "; stopping.");
				break;
			}
			int j = 0;
			for (IpModel ipmodel : iplist) {
				System.out.println("===" + j);
				j++;
				processRange(sqlData, ipmodel);
			}
			k = k + iplist.size();
		}

	}

	/**
	 * Reads the start offset from the command line, falling back to
	 * {@link #DEFAULT_START_OFFSET} when it is absent or not a number.
	 */
	private static int parseStartOffset(String[] args) {
		if (args == null || args.length == 0 || args[0] == null) {
			return DEFAULT_START_OFFSET;
		}
		try {
			return Math.max(0, Integer.parseInt(args[0].trim()));
		} catch (NumberFormatException e) {
			System.err.println("Invalid start offset '" + args[0]
					+ "', using " + DEFAULT_START_OFFSET);
			return DEFAULT_START_OFFSET;
		}
	}

	/**
	 * Looks up both ends of one range and stores the result. The range is
	 * skipped when either lookup fails, because comparing or inserting
	 * half-filled records would either crash or store misleading data.
	 */
	private static void processRange(SqlData sqlData, IpModel ipmodel) {
		String startIp = trimOrNull(ipmodel.getStartIp());
		System.out.println("startIp:" + startIp);
		Map<String, String> startInfo = lookup(startIp);

		String endIp = trimOrNull(ipmodel.getEndIp());
		System.out.println("endIp:" + endIp);
		Map<String, String> endInfo = lookup(endIp);

		if (startInfo == null || endInfo == null) {
			System.err.println("Lookup failed, skipping range " + startIp
					+ " - " + endIp);
			return;
		}

		// NOTE(review): the parsed "county" value is not copied onto either
		// model although SqlData inserts getCounty(); confirm whether
		// IpData/IpSpecial expose a setter for it.
		IpData ipdata = new IpData();
		ipdata.setIpId(startInfo.get("ipId"));
		ipdata.setStartIp(startInfo.get("startIp"));
		ipdata.setCountry(startInfo.get("country"));
		ipdata.setProvince(startInfo.get("province"));
		ipdata.setCity(startInfo.get("city"));
		ipdata.setOperator(startInfo.get("operator"));

		IpSpecial ipSpecial = new IpSpecial();
		ipSpecial.setIpId(endInfo.get("ipId"));
		// The map stores the queried address under "startIp"; for this second
		// lookup that address is the end of the range.
		ipSpecial.setEndIp(endInfo.get("startIp"));
		ipSpecial.setCountry(endInfo.get("country"));
		ipSpecial.setProvince(endInfo.get("province"));
		ipSpecial.setCity(endInfo.get("city"));
		ipSpecial.setOperator(endInfo.get("operator"));

		String startCity = ipdata.getCity();
		if (startCity != null && startCity.equals(ipSpecial.getCity())) {
			ipdata.setEndIp(ipSpecial.getEndIp());
			sqlData.insertIp(ipdata);
		} else {
			ipSpecial.setStartIp(ipdata.getStartIp());
			sqlData.insertIpSpecial(ipSpecial);
			System.out.println("---特殊IP---");
		}
	}

	/**
	 * Calls {@link #requestPost(String)} and converts a failure into
	 * {@code null}, recording the address in the bug file on a best-effort
	 * basis.
	 */
	private static Map<String, String> lookup(String ip) {
		if (ip == null || ip.length() == 0) {
			return null;
		}
		try {
			return requestPost(ip);
		} catch (IOException e) {
			e.printStackTrace();
			try {
				appendBugIp(ip);
			} catch (IOException logFailure) {
				logFailure.printStackTrace();
			}
			return null;
		}
	}

	private static String trimOrNull(String value) {
		return value == null ? null : value.trim();
	}

	/**
	 * Posts one address to the lookup page and parses the response.
	 * 
	 * <p>
	 * The response is stripped of markup and written to {@link #HTML_FILE},
	 * then reduced to one data row by {@link #saveIPOperator()} and finally
	 * split into fields by {@link #saveIPOperator2(String, String)}.
	 * 
	 * @param ipString
	 *            the address to look up
	 * @return the parsed fields, see {@link #saveIPOperator2(String, String)}
	 * @throws IOException
	 *             if the request or any of the intermediate files fails
	 */
	public static Map<String, String> requestPost(String ipString)
			throws IOException {

		URL url = new URL(LOOKUP_URL);
		URLConnection connection = url.openConnection();
		connection.setConnectTimeout(500000);
		connection.setDoOutput(true); // switches the request to POST
		connection.setReadTimeout(300000);

		OutputStreamWriter out = new OutputStreamWriter(
				connection.getOutputStream(), "8859_1");
		try {
			out.write("ip=" + ipString + "&action=2");
			out.flush();
		} finally {
			out.close();
		}

		// NOTE(review): the response and the intermediate files are decoded
		// with the platform default charset, as before; confirm the encoding
		// the site actually uses.
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				connection.getInputStream()));
		try {
			BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(HTML_FILE)));
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					// Mark row starts and cell ends before all remaining tags
					// are removed, so the row can be found and split later.
					line = line.replaceAll("<tr>", "ipOperator:");
					line = line.replaceAll("</td>", ",");
					line = line.replaceAll(TAG_REGEX, "");
					bufw.write(line);
					bufw.newLine();
					bufw.flush();
				}
			} finally {
				bufw.close();
			}
		} finally {
			closeQuietly(reader);
		}

		System.out.println("第一次过滤完毕,开始下一轮过滤");
		String ipstr = saveIPOperator();
		System.out.println("第一次过滤完毕,开始下一轮过滤");
		return saveIPOperator2(ipstr, ipString);
	}

	/**
	 * First filter: scans {@link #HTML_FILE} for row markers and keeps the
	 * line holding the second marker, with all whitespace removed. That line
	 * is also written to {@link #OPERATOR_FILE}.
	 * 
	 * @return the whitespace-free line of the second marker, or an empty
	 *         string when the file has fewer than two markers
	 * @throws IOException
	 *             if either file cannot be read or written
	 */
	public static String saveIPOperator() throws IOException {
		String ipstr = "";
		int markerCount = 1;

		BufferedReader bufr = new BufferedReader(new InputStreamReader(
				new FileInputStream(HTML_FILE)));
		try {
			BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(OPERATOR_FILE)));
			try {
				String line;
				while ((line = bufr.readLine()) != null) {
					Matcher m = ROW_MARKER.matcher(line);
					while (m.find()) {
						if (markerCount == 2) {
							ipstr = WHITESPACE.matcher(line).replaceAll("");
							bufw.write(ipstr);
							bufw.newLine();
							bufw.flush();
						}
						markerCount++;
					}
				}
			} finally {
				bufw.close();
			}
		} finally {
			closeQuietly(bufr);
		}

		return ipstr;

	}

	/**
	 * Second filter: splits the extracted row into named fields.
	 * 
	 * <p>
	 * The row is expected to look like {@code marker:ip,country,province,city,
	 * county,operator}. Missing fields are stored as empty strings. When
	 * country, province, city or operator is missing, the queried address is
	 * appended to {@link #BUG_IP_FILE}; a missing county alone is not treated
	 * as an error.
	 * 
	 * @param ipstr
	 *            the row returned by {@link #saveIPOperator()}; may be empty
	 * @param ipString
	 *            the address that was queried
	 * @return a map with the keys {@code ipId}, {@code startIp},
	 *         {@code country}, {@code province}, {@code city}, {@code county}
	 *         and {@code operator}
	 * @throws IOException
	 *             if the bug file cannot be written
	 */
	public static Map<String, String> saveIPOperator2(String ipstr,
			String ipString) throws IOException {
		String[] fields = splitFields(ipstr);

		// HH (24-hour clock) keeps morning and afternoon ids distinct.
		SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
		Map<String, String> ipmap = new HashMap<String, String>();
		ipmap.put("ipId", getRandomString(14) + sdf.format(new Date()));
		ipmap.put("startIp", ipString);

		boolean incomplete = !putField(ipmap, "country", fields, 1);
		incomplete |= !putField(ipmap, "province", fields, 2);
		incomplete |= !putField(ipmap, "city", fields, 3);
		putField(ipmap, "county", fields, 4); // optional
		incomplete |= !putField(ipmap, "operator", fields, 5);

		System.out.println("----------" + (incomplete ? ipString : ""));
		if (incomplete) {
			appendBugIp(ipString);
		}

		return ipmap;

	}

	/**
	 * Returns the comma-separated values after the first colon of the row, or
	 * an empty array when the row is missing or has no colon.
	 */
	private static String[] splitFields(String ipstr) {
		if (ipstr == null) {
			return new String[0];
		}
		String[] parts = ipstr.split("\\:");
		if (parts.length < 2) {
			return new String[0];
		}
		return parts[1].split(",");
	}

	/**
	 * Stores {@code fields[index]} under {@code key}, or an empty string when
	 * the row has no such field.
	 * 
	 * @return {@code true} if the field was present
	 */
	private static boolean putField(Map<String, String> target, String key,
			String[] fields, int index) {
		boolean present = index < fields.length;
		target.put(key, present ? fields[index] : "");
		return present;
	}

	/** Appends one address to {@link #BUG_IP_FILE}. */
	private static void appendBugIp(String ip) throws IOException {
		BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(BUG_IP_FILE, true)));
		try {
			bufw.write(String.valueOf(ip));
			bufw.newLine();
			bufw.flush();
		} finally {
			bufw.close();
		}
	}

	/** Closes a resource, ignoring any failure; intended for readers only. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if closing a reader fails.
		}
	}

	/**
	 * Builds a random string of lower-case letters and digits.
	 * 
	 * @param length
	 *            number of characters to generate; values below one yield an
	 *            empty string
	 * @return the generated string
	 */
	public static String getRandomString(int length) {
		String base = "abcdefghijklmnopqrstuvwxyz0123456789";
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < length; i++) {
			sb.append(base.charAt(RANDOM.nextInt(base.length())));
		}
		return sb.toString();
	}

}

/**
 * JDBC access to the {@code ipselect} MySQL database used by the crawler.
 * 
 * <p>
 * All instances share one static {@link #connection}. Every statement and
 * result set is local to the method that uses it and is closed before the
 * method returns. SQL errors are printed and otherwise swallowed, so query
 * methods return an empty result in that case.
 */
class SqlData {

	/** Retained for source compatibility; not assigned by this class. */
	public static String username;
	/** Retained for source compatibility; not assigned by this class. */
	public static String password;
	/** Connection shared by all instances; opened by the constructor. */
	public static Connection connection;
	/**
	 * Retained for source compatibility; statements are now method-local so
	 * that they can be closed reliably.
	 */
	public static PreparedStatement ps;

	/**
	 * Opens the shared connection unless an open one already exists. Failures
	 * are printed and leave {@link #connection} unchanged.
	 */
	public SqlData() {

		String url = "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull";
		String dbUser = "root";
		String dbPassword = "";
		try {
			if (connection != null && !connection.isClosed()) {
				// Reuse the open connection instead of orphaning it.
				return;
			}
			Class.forName("com.mysql.jdbc.Driver");
			connection = DriverManager.getConnection(url, dbUser, dbPassword);
		} catch (ClassNotFoundException cnfex) {
			System.err.println("装载 JDBC/ODBC 驱动程序失败");
			cnfex.printStackTrace();
		} catch (SQLException sqlex) {
			System.err.println("无法连接数据库");
			sqlex.printStackTrace();
		}

	}

	/**
	 * Counts the rows of {@code iptable}.
	 * 
	 * @return the row count, or {@code 0} if the query fails
	 */
	public int findIpCount() {
		int num = 0;
		PreparedStatement stmt = null;
		ResultSet rs = null;
		try {
			stmt = connection.prepareStatement("select count(*) from iptable");
			rs = stmt.executeQuery();
			if (rs.next()) {
				num = rs.getInt(1);
			}
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(rs, stmt);
		}
		System.out.println("====count:" + num);

		return num;

	}

	/**
	 * Reads one page of {@code iptable}.
	 * 
	 * @param offset
	 *            number of rows to skip
	 * @param amount
	 *            maximum number of rows to return
	 * @return the rows of the page; empty if there are none or the query fails
	 */
	public List<IpModel> findIp(int offset, int amount) {
		List<IpModel> listiptable = new ArrayList<IpModel>();
		PreparedStatement stmt = null;
		ResultSet rs = null;
		try {
			// Without an ORDER BY the row order of LIMIT pages is unspecified,
			// so consecutive pages could overlap or skip rows.
			stmt = connection
					.prepareStatement("select * from iptable order by ip_id limit ?,?");
			stmt.setInt(1, offset);
			stmt.setInt(2, amount);

			rs = stmt.executeQuery();
			while (rs.next()) {
				IpModel row = new IpModel();
				row.setIpId(rs.getString("ip_id"));
				row.setStartIp(rs.getString("startIp"));
				System.out.println("iptable:" + rs.getString("startIp"));
				row.setEndIp(rs.getString("endIp"));
				System.out.println("iptable:" + rs.getString("endIp"));
				listiptable.add(row);
			}

		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(rs, stmt);
		}

		return listiptable;

	}

	/**
	 * Inserts a regular range into {@code ipdata} unless a row with the same
	 * start and end address already exists.
	 * 
	 * @param ipData
	 *            the range to store
	 */
	public void insertIp(IpData ipData) {
		if (!findIpData(ipData).isEmpty()) {
			System.out.println("已存在有数据");
			return;
		}
		PreparedStatement stmt = null;
		try {
			stmt = connection
					.prepareStatement("insert into ipdata (ip_id,country,province,city,county,operator,startIp,endIp) values (?,?,?,?,?,?,?,?)");
			stmt.setString(1, ipData.getIpId());
			stmt.setString(2, ipData.getCountry());
			stmt.setString(3, ipData.getProvince());
			stmt.setString(4, ipData.getCity());
			stmt.setString(5, ipData.getCounty());
			stmt.setString(6, ipData.getOperator());
			stmt.setString(7, ipData.getStartIp());
			stmt.setString(8, ipData.getEndIp());
			stmt.executeUpdate();
			System.out.println("------记录插入成功------");
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(null, stmt);
		}
	}

	/**
	 * Finds the {@code ipdata} rows with the same start and end address as
	 * the given range.
	 * 
	 * @param ipData11
	 *            supplies the start and end address to search for
	 * @return the matching rows (id, start and end address only); empty if
	 *         there are none or the query fails
	 */
	public List<IpData> findIpData(IpData ipData11) {
		List<IpData> list = new ArrayList<IpData>();
		PreparedStatement stmt = null;
		ResultSet rs = null;
		try {
			stmt = connection
					.prepareStatement("select * from ipdata where startIp=? and endIp=?");
			stmt.setString(1, ipData11.getStartIp());
			stmt.setString(2, ipData11.getEndIp());

			rs = stmt.executeQuery();
			while (rs.next()) {
				// One object per row; a shared instance would make every list
				// entry show the last row.
				IpData row = new IpData();
				row.setIpId(rs.getString("ip_id"));
				row.setStartIp(rs.getString("startIp"));
				row.setEndIp(rs.getString("endIp"));
				list.add(row);
			}

		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(rs, stmt);
		}

		return list;

	}

	/**
	 * Inserts a special range (start and end resolve differently) into
	 * {@code ipspecial} unless a row with the same start and end address
	 * already exists.
	 * 
	 * @param ipData
	 *            the range to store
	 */
	public void insertIpSpecial(IpSpecial ipData) {
		if (!findIpSpecial(ipData).isEmpty()) {
			System.out.println("已存在有数据");
			return;
		}
		PreparedStatement stmt = null;
		try {
			stmt = connection
					.prepareStatement("insert into ipspecial (ip_id,country,province,city,county,operator,startIp,endIp) values (?,?,?,?,?,?,?,?)");
			stmt.setString(1, ipData.getIpId());
			stmt.setString(2, ipData.getCountry());
			stmt.setString(3, ipData.getProvince());
			stmt.setString(4, ipData.getCity());
			stmt.setString(5, ipData.getCounty());
			stmt.setString(6, ipData.getOperator());
			stmt.setString(7, ipData.getStartIp());
			stmt.setString(8, ipData.getEndIp());
			stmt.executeUpdate();
			System.out.println("-------特殊Ip插入成功------");
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(null, stmt);
		}
	}

	/**
	 * Finds the {@code ipspecial} rows with the same start and end address as
	 * the given range.
	 * 
	 * @param ipSpecial
	 *            supplies the start and end address to search for
	 * @return the matching rows (id, start and end address only); empty if
	 *         there are none or the query fails
	 */
	public List<IpSpecial> findIpSpecial(IpSpecial ipSpecial) {
		List<IpSpecial> list = new ArrayList<IpSpecial>();
		PreparedStatement stmt = null;
		ResultSet rs = null;
		try {
			stmt = connection
					.prepareStatement("select * from ipspecial where startIp=? and endIp=?");
			stmt.setString(1, ipSpecial.getStartIp());
			stmt.setString(2, ipSpecial.getEndIp());

			rs = stmt.executeQuery();
			while (rs.next()) {
				IpSpecial row = new IpSpecial();
				row.setIpId(rs.getString("ip_id"));
				row.setStartIp(rs.getString("startIp"));
				row.setEndIp(rs.getString("endIp"));
				list.add(row);
			}

		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(rs, stmt);
		}

		return list;

	}

	/**
	 * Closes a result set and its statement, ignoring close failures. Either
	 * argument may be {@code null}.
	 */
	private static void closeQuietly(ResultSet rs, PreparedStatement stmt) {
		if (rs != null) {
			try {
				rs.close();
			} catch (SQLException ignored) {
				// Closing is best effort; the query result is already read.
			}
		}
		if (stmt != null) {
			try {
				stmt.close();
			} catch (SQLException ignored) {
				// Closing is best effort; the statement has already run.
			}
		}
	}

}




分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics