2011.06.29——— Jsoup HttpClient 抓取网络上的图片
参考:
http://www.iteye.com/topic/1106648
http://www.ibm.com/developerworks/cn/java/j-lo-jsouphtml/index.html?ca=drs-
jsoup 官方网站:
http://jsoup.org
需要的主要jar包
httpclient-4.0.1.jar jsoup-1.5.2.jar
主要代码 如下
Exmaple3.java
package com.th.spider.test;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small image scraper: parses pages with jsoup, then downloads every matched
 * image with Apache HttpClient and stores it under {@link #PIC_DIR}.
 *
 * <p>Three entry points ({@code go}, {@code go2}, {@code go3}) exist, one per
 * page layout; each uses its own CSS selector to locate the images.
 */
public class Exmaple3 {

    private static final Log log = LogFactory.getLog(Exmaple3.class);

    /** Directory in which downloaded images are stored. */
    private static final String PIC_DIR = "/home/li/pic";

    /** Timeout in milliseconds, used for both connecting and reading. */
    private static final int TIME_OUT = 5000;

    /**
     * Downloads every image matched by {@code div.piclist img[src]} on the
     * given page, one thread per image, starting the threads 500 ms apart.
     *
     * @param url address of the page to scan
     * @throws Exception if the page cannot be fetched or the thread is interrupted
     */
    static void go3(String url) throws Exception {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("div.piclist img[src]");
        for (int i = 0; i < links.size(); i++) {
            // absUrl resolves relative src values against the page's base URI;
            // a raw relative path could not be fetched by HttpClient.
            final String imgUrl = links.get(i).absUrl("src");
            if (imgUrl.length() == 0) {
                log.warn("Skipping image with unresolvable src [" + links.get(i).attr("src") + "]");
                continue;
            }
            log.info(imgUrl);
            // Stagger the requests so the server is not hit all at once.
            Thread.sleep(500);
            new Thread(new Runnable() {
                public void run() {
                    saveQuietly(imgUrl);
                }
            }).start();
        }
    }

    /**
     * Follows every link matched by {@code div.cc a[href]} on the given index
     * page and, in one thread per linked page, downloads all images matched by
     * {@code div.mb_jjnr img[src]}. Threads are started 500 ms apart.
     *
     * @param url address of the index page
     * @throws Exception if the index page cannot be fetched or the thread is interrupted
     */
    static void go2(String url) throws Exception {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("div.cc a[href]");
        for (int i = 0; i < links.size(); i++) {
            // Resolve against the page itself instead of a hard-coded host so
            // that relative and absolute hrefs are both handled.
            final String dirUrl = links.get(i).absUrl("href");
            if (dirUrl.length() == 0) {
                log.warn("Skipping link with unresolvable href [" + links.get(i).attr("href") + "]");
                continue;
            }
            log.info(dirUrl);
            // Stagger the requests so the server is not hit all at once.
            Thread.sleep(500);
            new Thread(new Runnable() {
                public void run() {
                    savePageImages(dirUrl);
                }
            }).start();
        }
    }

    /**
     * Handles a forum post URL: downloads every {@code img} whose class is
     * {@code zoom}, taking the real image location from its {@code file}
     * attribute.
     *
     * @param url address of the post
     * @throws Exception if the page cannot be fetched or an image cannot be saved
     */
    static void go(String url) throws Exception {
        // Request the page and parse it into a document.
        Document doc = Jsoup.connect(url).post();
        // Sample of the markup being matched:
        // <img src="static/image/common/none.gif" file="data/attachment/forum/201105/08/174412nz3jq4z90s33s2t0.jpg" width="770" class="zoom" ... />
        Elements imgs = doc.select("img[class=zoom]");
        for (int i = 0; i < imgs.size(); i++) {
            Element img = imgs.get(i);
            // Resolve the (usually relative) "file" attribute against the base
            // URI; plain string concatenation only works when the base URI
            // happens to end with '/'.
            String picURL = img.absUrl("file");
            if (picURL.length() == 0) {
                log.warn("Skipping image with unresolvable file attribute [" + img.attr("file") + "]");
                continue;
            }
            log.info(picURL);
            save(picURL);
        }
    }

    /**
     * Downloads the image at {@code url} and writes it to {@link #PIC_DIR},
     * named after the last path segment of the URL. Nothing is written when
     * the download yields no bytes.
     *
     * @param url absolute address of the image
     * @throws IllegalArgumentException if no file name can be derived from {@code url}
     * @throws Exception if the target directory or file cannot be written
     */
    static void save(String url) throws Exception {
        String fileName = fileNameOf(url);
        byte[] bit = getByte(url);
        if (bit.length == 0) {
            return;
        }
        File dir = new File(PIC_DIR);
        // mkdirs() returns false when another download thread created the
        // directory first, hence the second check.
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory [" + PIC_DIR + "]");
        }
        String filePath = PIC_DIR + "/" + fileName;
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filePath));
        try {
            out.write(bit);
            out.flush();
        } finally {
            out.close();
        }
        log.info("Create File success! [" + filePath + "]");
    }

    /**
     * Fetches the content at {@code uri} as a byte array.
     *
     * @param uri absolute address to download
     * @return the response body, or an empty array when the request fails,
     *         the status is not 200, or the response has no body
     * @throws Exception declared for compatibility; failures are logged and
     *         reported through the empty result
     */
    static byte[] getByte(String uri) throws Exception {
        HttpClient client = new DefaultHttpClient();
        client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT);
        // Without a socket timeout a stalled server would block this thread forever.
        client.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, TIME_OUT);
        try {
            HttpResponse response = client.execute(new HttpGet(uri));
            int status = response.getStatusLine().getStatusCode();
            if (status != HttpStatus.SC_OK) {
                log.warn("Unexpected HTTP status " + status + " for [" + uri + "]");
                return new byte[0];
            }
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                return EntityUtils.toByteArray(entity);
            }
        } catch (Exception e) {
            log.error("Failed to download [" + uri + "]", e);
        } finally {
            // Releases the connection even if the body was never consumed.
            client.getConnectionManager().shutdown();
        }
        return new byte[0];
    }

    public static void main(String[] args) throws Exception {
        // Start scraping images.
        go2("http://www.3lian.com/gif/more/03/0301.html");
        //go3("http://www.ivsky.com/tupian/nvxing_gouwu_qingjing_v6969/");
    }

    /** Fetches one gallery page and saves every image in {@code div.mb_jjnr}, logging failures. */
    private static void savePageImages(String dirUrl) {
        try {
            Document doc = Jsoup.connect(dirUrl).get();
            Elements images = doc.select("div.mb_jjnr img[src]");
            for (int j = 0; j < images.size(); j++) {
                String imgUrl = images.get(j).absUrl("src");
                if (imgUrl.length() == 0) {
                    log.warn("Skipping image with unresolvable src [" + images.get(j).attr("src") + "]");
                    continue;
                }
                log.info(imgUrl);
                // Per-image handling: one broken image must not abort the rest of the page.
                saveQuietly(imgUrl);
            }
        } catch (Exception e) {
            log.error("Failed to process page [" + dirUrl + "]", e);
        }
    }

    /** Saves one image, logging instead of propagating any failure (for use inside worker threads). */
    private static void saveQuietly(String imgUrl) {
        try {
            save(imgUrl);
        } catch (Exception e) {
            log.error("Failed to save image [" + imgUrl + "]", e);
        }
    }

    /** Returns the text after the last '/' of {@code url}, rejecting names that cannot be used as a file. */
    private static String fileNameOf(String url) {
        if (url == null) {
            throw new IllegalArgumentException("url must not be null");
        }
        // lastIndexOf yields -1 when there is no '/', so the whole string is used.
        String name = url.substring(url.lastIndexOf('/') + 1);
        if (name.length() == 0 || ".".equals(name) || "..".equals(name)) {
            throw new IllegalArgumentException("Cannot derive a file name from URL [" + url + "]");
        }
        return name;
    }
}
分享到:
相关推荐
jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jar,jsoup-1.15.3.jarjsoup-...
Android开发一大神器——Jsoup.pdf
基于SSM+maven+httpClient+jsoup实现小说网站项目源码.zip 基于SSM+maven+httpClient+jsoup实现小说网站项目源码.zip 基于SSM+maven+httpClient+jsoup实现小说网站项目源码.zip 基于SSM+maven+httpClient+jsoup实现...
Android实战——jsoup实现网络爬虫,糗事百科项目的起步
基于SSM+maven+httpClient+jsoup实现小说网站项目.zip基于SSM+maven+httpClient+jsoup实现小说网站项目.zip基于SSM+maven+httpClient+jsoup实现小说网站项目.zip基于SSM+maven+httpClient+jsoup实现小说网站项目.zip...
资源名字:基于java+Jsoup+HttpClient的网络爬虫技术的网络新闻分析系统设计与实现(源码+文档)_MySQL_网络爬虫_数据挖掘.zip 资源内容:项目全套源码+完整文档 源码说明: 全部项目源码都是经过测试校正后百分百...
赠送jar包:jsoup-1.14.3.jar; 赠送原API文档:jsoup-1.14.3-javadoc.jar; 赠送源代码:jsoup-1.14.3-sources.jar; 赠送Maven依赖信息文件:jsoup-1.14.3.pom; 包含翻译后的API文档:jsoup-1.14.3-javadoc-API...
Jsoup+httpclient模拟登陆和抓取页面.pdf
org.jsoup.zip
jsoup.jar下载网络爬虫java,java工具类
Jsoup+httpclient 模拟登陆和抓取页面 package com.app.html; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader...
java爬虫技术所需要的jar包,里面有jsuop技术相关的jar:chardet.jar、commons-lang.jar、commons-logging.jar、cpdetector.jar、httpclient-4.2.5.jar、httpcore-4.2.4.jar、jsoup-1.7.2.jar;;附加 htmlparser ...
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
httpClient+jsoup抓取网页数据实例和jar包
jsoup1.8.1抓取爬虫工具jsoup1.8.1抓取爬虫工具jsoup1.8.1抓取爬虫工具
使用jsoup异步抓取新闻数据装载listview(仿开源中国资讯列表.rar,太多无法一一验证是否可用,程序如果跑不起来需要自调,部分代码功能进行参考学习。
commons-beanutils-1.8.3 commons -codec-1.6 commons -collections-3.2.1 commons- lang-2.5.jar commons- longging-1.13.jar4.3.3.jar ...jsoup_1.6.2_add.jar mysql-connector-java.5.1.7.jar sqljdbc4.jar
包含jsoup-1.7.3.jar,jsoup-1.7.3-javadoc.jar,jsoup-1.7.3-sources.jar,com.springsource.org.apache.commons.httpclient-3.1.0.jar,org.apache.commons.httpclient.jar
主要介绍了Java爬虫Jsoup+httpclient获取动态生成的数据的相关资料,需要的朋友可以参考下
import org.jsoup.Jsoup