你的分享就是我们的动力 ---﹥

一段非常简单的多线程爬虫程序,支持深度控制

时间:2013-05-22 14:56来源:www.chengxuyuans.com 点击:

代码简介

Fetch 是一个抽象类:具体的抓取器需要继承该类并实现 process 方法。示例中使用 jsoup 解析返回的 HTML 文本,使用 Apache Commons HttpClient 模拟浏览器发送请求。

代码片段

import java.io.InputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

/**
 * Base class for a crawl task: fetches one URL over HTTP and hands the
 * response body to {@link #process(InputStream)}.
 *
 * <p>Each instance carries a root {@link U} (URL plus its depth) and a depth
 * limit. A URL is fetched only while {@code u.getDepth() + 1 <= depth}.
 * Subclasses implement {@link #process(InputStream)} to consume the body.
 */
public abstract class Fetch implements Runnable {
	// NOTE(review): the scheme is hard-coded, so getHost() always reports
	// "http://" even when the page was served over https — confirm whether
	// callers need the real scheme.
	private static final String HTTP_PROTOCOL = "http://";
	public final static String USER_AGENT_H = "User-Agent";
	public final static String REFERER_H = "Referer";
	public final static String USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22";
	private U root;
	private int depth;
	private String host;

	/**
	 * Creates a task for an already-built root entry.
	 *
	 * @param root  URL (with its depth) to fetch when the task runs
	 * @param depth depth limit applied in {@link #get(U)}
	 */
	public Fetch(U root, int depth) {
		super();
		this.root = root;
		this.depth = depth;
	}

	/**
	 * Creates a task whose root entry is {@code root} at depth 0.
	 *
	 * @param root  URL to fetch when the task runs
	 * @param depth depth limit applied in {@link #get(U)}
	 */
	public Fetch(String root, int depth) {
		this.root = new U(0, root);
		this.depth = depth;
	}

	/**
	 * @return {@code "http://" + host} of the last successful (200) fetch,
	 *         or {@code null} if none has completed or it was never set
	 */
	public String getHost() {
		return host;
	}

	public void setHost(String host) {
		this.host = host;
	}

	public U getRoot() {
		return root;
	}

	public void setRoot(U root) {
		this.root = root;
	}

	public int getDepth() {
		return depth;
	}

	public void setDepth(int depth) {
		this.depth = depth;
	}

	/**
	 * Fetches the root entry. Failures are reported on standard error rather
	 * than propagated, so a failed fetch ends only this task.
	 */
	@Override
	public void run() {
		try {
			this.get(root);
		} catch (Exception e) {
			if (e instanceof InterruptedException) {
				// Preserve the interrupt status for whoever owns this thread.
				Thread.currentThread().interrupt();
			}
			// Previously the exception was dropped silently; report it so a
			// failed fetch is visible.
			System.err.println("Fetch failed for " + root + ": " + e);
		}
	}

	/**
	 * Issues a GET for {@code u} and, on HTTP 200, records the host and passes
	 * the response body to {@link #process(InputStream)}. Does nothing when
	 * {@code u} is beyond the depth limit.
	 *
	 * @param u entry to fetch
	 * @throws Exception if the request fails or {@code process} throws
	 */
	@SuppressWarnings("deprecation")
	protected void get(U u) throws Exception {
		// Check the limit first so no client is allocated for a skipped URL.
		if (u.getDepth() + 1 > this.depth) {
			return;
		}
		HttpClient client = new HttpClient();
		HttpMethod get = new GetMethod(u.getUrl());
		get.setRequestHeader(USER_AGENT_H, USER_AGENT);
		try {
			int status = client.executeMethod(get);
			if (status == HttpStatus.SC_OK) {
				this.setHost(HTTP_PROTOCOL + get.getHostConfiguration().getHost());
				InputStream in = get.getResponseBodyAsStream();
				// The body stream may be null when the response has no content.
				if (in != null) {
					try {
						process(in);
					} finally {
						in.close();
					}
				}
			}
		} finally {
			// Always hand the connection back, even when process() throws.
			get.releaseConnection();
		}
	}

	/**
	 * Consumes the body of a successfully fetched page.
	 *
	 * @param in response body stream; closed by the caller after this returns
	 * @throws Exception on any processing failure
	 */
	protected abstract void process(InputStream in) throws Exception;
}

代码片段

/**
 * Mutable pair of a URL string and an integer depth.
 */
public class U {
	private String url;
	private int depth;

	/**
	 * @param depth depth value to store
	 * @param url   URL string to store
	 */
	public U(int depth, String url) {
		this.url = url;
		this.depth = depth;
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public int getDepth() {
		return depth;
	}

	public void setDepth(int depth) {
		this.depth = depth;
	}

	/**
	 * @return a bracketed text form containing the depth followed by the URL
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("[深度:");
		text.append(depth);
		text.append(",URL:");
		text.append(url);
		text.append("]");
		return text.toString();
	}
}

代码片段

import java.io.InputStream;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * {@link Fetch} implementation that extracts the anchors of a fetched page,
 * prints each discovered link and starts a new {@code AchorFetch} thread for
 * every link that is still within the depth limit.
 */
public class AchorFetch extends Fetch {

	/**
	 * @param root  URL to start from (depth 0)
	 * @param depth depth limit passed to {@link Fetch}
	 */
	public AchorFetch(String root, int depth) {
		super(root, depth);
	}

	/**
	 * @param root  entry to fetch
	 * @param depth depth limit passed to {@link Fetch}
	 */
	public AchorFetch(U root, int depth) {
		super(root, depth);
	}

	/**
	 * Parses the page as UTF-8 HTML, prints one line per followable link and
	 * spawns a child fetch for links that are within the depth limit.
	 *
	 * @param in response body of the page identified by {@link #getRoot()}
	 * @throws Exception if the stream cannot be read or parsed
	 */
	@Override
	protected void process(InputStream in) throws Exception {
		U root = this.getRoot();
		int maxDepth = this.getDepth();
		int childDepth = root.getDepth() + 1;
		// Use the page's own URL as base URI so absUrl() can resolve relative
		// links against it; with an empty base it never could.
		String base = root.getUrl() == null ? "" : root.getUrl();
		Document doc = Jsoup.parse(in, "UTF-8", base);
		// Only anchors that actually carry an href attribute are of interest.
		Elements as = doc.select("a[href]");
		for (Element a : as) {
			String href = resolve(a);
			if (href == null) {
				continue;
			}
			String line = "[线程:" + root.getUrl() + "][总深度:"
					+ maxDepth + "][当前深度:"
					+ root.getDepth() + "][URL:" + href + "]";
			System.out.println(line);
			// Fetch.get() returns immediately for entries beyond the limit,
			// so starting a thread for them would be pure overhead.
			if (childDepth + 1 <= maxDepth) {
				U u = new U(childDepth, href);
				new Thread(new AchorFetch(u, maxDepth)).start();
			}
		}
	}

	/**
	 * Turns an anchor into an absolute http(s) URL.
	 *
	 * @return the URL to follow, or {@code null} when the link is empty, a
	 *         fragment, or uses a scheme other than http/https
	 */
	private String resolve(Element a) {
		String href = a.absUrl("href");
		if (StringUtils.isBlank(href)) {
			// absUrl() could not resolve the link; fall back to prefixing the
			// host, but only for plain relative paths.
			String raw = a.attr("href").trim();
			String host = this.getHost();
			if (host == null || raw.length() == 0 || raw.startsWith("#")
					|| hasScheme(raw)) {
				return null;
			}
			href = host + (raw.startsWith("/") ? raw : "/" + raw);
		}
		return isHttpUrl(href) ? href : null;
	}

	/** Reports whether {@code ref} starts with a URI scheme such as {@code javascript:}. */
	private static boolean hasScheme(String ref) {
		for (int i = 0; i < ref.length(); i++) {
			char c = ref.charAt(i);
			if (c == ':') {
				return i > 0;
			}
			if (c == '/' || c == '?' || c == '#') {
				return false;
			}
		}
		return false;
	}

	/** Reports whether {@code url} begins with {@code http://} or {@code https://}, ignoring case. */
	private static boolean isHttpUrl(String url) {
		return url.regionMatches(true, 0, "http://", 0, 7)
				|| url.regionMatches(true, 0, "https://", 0, 8);
	}
}

代码片段

/**
 * Demo entry point: starts one AchorFetch for http://www.sina.com with a
 * depth argument of 2 on a new thread.
 */
public static void main(String[] args) {
		try {
			Thread crawler = new Thread(new AchorFetch("http://www.sina.com", 2));
			crawler.start();
		} catch (Exception e) {
			// Report any failure while setting up or starting the crawl thread.
			e.printStackTrace();
		}
	}

转载注明地址http://www.chengxuyuans.com/code/java/61604.html