大战熟女丰满人妻av-荡女精品导航-岛国aaaa级午夜福利片-岛国av动作片在线观看-岛国av无码免费无禁网站-岛国大片激情做爰视频

專注Java教育14年 全國咨詢/投訴熱線:400-8080-105
動力節點LOGO圖
始于2009,口口相傳的Java黃埔軍校
首頁 hot資訊 封裝Java爬蟲工具類

封裝Java爬蟲工具類

更新時間:2021-09-22 11:12:00 來源:動力節點 瀏覽1027次

封裝了一個JAVA爬蟲工具類。

1.在maven中引入依賴jar

   <dependency>
		<groupId>net.sourceforge.htmlunit</groupId>
		<artifactId>htmlunit</artifactId>
		<version>2.27</version>
	</dependency>
	<dependency>
		<groupId>org.jsoup</groupId>
		<artifactId>jsoup</artifactId>
		<version>1.8.3</version>
	</dependency>    

2.工具類

  public class HttpHtmlUnit {
	/**
	 * Timeout in milliseconds (default 20000). Applied as both socket and connect timeout of the
	 * HTTP requests, and as the page-load and JavaScript timeout of the HtmlUnit client.
	 */
	private int timeout = 20000;
	/**
	 * Time in milliseconds (default 20000) to wait for background (asynchronous) JavaScript
	 * after a page has been loaded through HtmlUnit.
	 */
	private int waitForBackgroundJavaScript = 20000;
	/**
	 * Cookie name-to-value table. Sent as the Cookie header of each request and refreshed from
	 * the cookie store of the request context after each request.
	 */
	private Map<String, String> cookieMap = new HashMap<>();
/**
 * Charset name passed to EntityUtils.toString when reading response bodies; defaults to UTF-8.
 */
private String charset = "UTF-8";
// Shared instance, created lazily by getInstance().
private static HttpHtmlUnit httpUtils;
// Private constructor: instances are obtained through getInstance() only.
private HttpHtmlUnit() {
}
/**
 * Returns the shared {@code HttpHtmlUnit} instance, creating it on first use.
 *
 * <p>The method is synchronized so that concurrent first calls cannot each create their own
 * instance (which would leave callers with different cookie tables).
 *
 * @return the single shared instance
 */
public static synchronized HttpHtmlUnit getInstance() {
	if (httpUtils == null) {
		httpUtils = new HttpHtmlUnit();
	}
	return httpUtils;
}
/**
 * Removes every entry from the cookie map, so later requests start without stored cookies.
 */
public void invalidCookieMap() {
	cookieMap.clear();
}
/**
 * Returns the configured request timeout.
 *
 * @return the timeout in milliseconds
 */
public int getTimeout() {
	return timeout;
}
/**
 * Sets the request timeout.
 *
 * @param timeout the timeout in milliseconds
 */
public void setTimeout(int timeout) {
	this.timeout = timeout;
}
/**
 * Returns the charset name used when reading response bodies.
 *
 * @return the charset name
 */
public String getCharset() {
	return charset;
}
/**
 * Sets the charset name used when reading response bodies.
 *
 * @param charset the charset name, e.g. "UTF-8"
 */
public void setCharset(String charset) {
	this.charset = charset;
}
/**
 * Returns how long a rendered page load waits for background JavaScript.
 *
 * @return the wait time in milliseconds
 */
public int getWaitForBackgroundJavaScript() {
	return waitForBackgroundJavaScript;
}
/**
 * Sets how long to wait for asynchronous JavaScript to run when fetching a fully rendered
 * HTML page.
 *
 * @param waitForBackgroundJavaScript the wait time in milliseconds
 */
public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
	this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
}
/**
 * Parses an HTML string into a Jsoup document, with {@code &nbsp;} entities stripped.
 *
 * @param html the raw HTML text
 * @return the parsed document
 * @throws Exception declared for callers; the parsing itself is delegated to the
 *                   private helper
 */
public static Document parseHtmlToDoc(String html) throws Exception {
	final Document parsed = removeHtmlSpace(html);
	return parsed;
}
/**
 * Parses the HTML, serializes it back to text, deletes every {@code &nbsp;} entity from that
 * text, and parses the result again.
 *
 * @param str the raw HTML text
 * @return the document parsed from the entity-stripped markup
 */
private static Document removeHtmlSpace(String str) {
	final String serialized = Jsoup.parse(str).html();
	final String withoutNbsp = serialized.replace("&nbsp;", "");
	return Jsoup.parse(withoutNbsp);
}
/**
 * Executes a GET request and returns the response parsed as a document.
 *
 * @param url the URL to request
 * @return the parsed response body
 * @throws Exception if the request or the parsing fails
 */
public Document executeGetAsDocument(String url) throws Exception {
	final String body = executeGet(url);
	return parseHtmlToDoc(body);
}
/**
 * Executes an HTTP GET request and returns the response body as a string.
 *
 * <p>The entries of the cookie map are sent in the {@code Cookie} header, and the cookies found
 * in the request context's cookie store after the call are merged back into the map. The body
 * is returned whatever the HTTP status code is (an error page is returned as-is).
 *
 * @param url the URL to request
 * @return the response body read via {@code EntityUtils.toString} with the configured charset,
 *         or an empty string when the response carries no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String executeGet(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	HttpClientContext context = HttpClientContext.create();
	// try-with-resources closes the response and then the client on every exit path,
	// including when cookie handling or body decoding throws.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpGet, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Executes a GET request through the certificate-trusting HTTPS client and returns the
 * response parsed as a document.
 *
 * @param url the URL to request
 * @return the parsed response body
 * @throws Exception if the request or the parsing fails
 */
public Document executeGetWithSSLAsDocument(String url) throws Exception {
	final String body = executeGetWithSSL(url);
	return parseHtmlToDoc(body);
}
/**
 * Executes an HTTP GET request through the client built by {@code createSSLInsecureClient()}
 * and returns the response body as a string.
 *
 * <p>The entries of the cookie map are sent in the {@code Cookie} header, and the cookies found
 * in the request context's cookie store after the call are merged back into the map. The body
 * is returned whatever the HTTP status code is.
 *
 * @param url the URL to request
 * @return the response body read via {@code EntityUtils.toString} with the configured charset,
 *         or an empty string when the response carries no entity
 * @throws Exception if the SSL client cannot be created, the request fails, or the body cannot
 *                   be read
 */
public String executeGetWithSSL(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	HttpClientContext context = HttpClientContext.create();
	// try-with-resources closes the response and then the client on every exit path,
	// including when cookie handling or body decoding throws.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(httpGet, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Executes a form POST request and returns the response parsed as a document.
 *
 * @param url    the URL to post to
 * @param params the form fields to send
 * @return the parsed response body
 * @throws Exception if the request or the parsing fails
 */
public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
	final String body = executePost(url, params);
	return parseHtmlToDoc(body);
}
/**
 * Executes an HTTP POST request with URL-encoded form fields and returns the response body.
 *
 * <p>The entries of the cookie map are sent in the {@code Cookie} header, and the cookies found
 * in the request context's cookie store after the call are merged back into the map.
 *
 * @param url    the URL to post to
 * @param params the form fields to send, as name-to-value pairs
 * @return the response body read via {@code EntityUtils.toString} with the configured charset,
 *         or an empty string when the response carries no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String executePost(String url, Map<String, String> params) throws Exception {
	HttpPost httpPost = new HttpPost(url);
	httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
	List<NameValuePair> formFields = new ArrayList<>();
	for (Map.Entry<String, String> entry : params.entrySet()) {
		formFields.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
	}
	httpPost.setEntity(new UrlEncodedFormEntity(formFields));
	HttpClientContext context = HttpClientContext.create();
	// The client and the response were previously never closed; try-with-resources
	// releases both on every exit path.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpPost, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		// Guard against a body-less response instead of handing null to EntityUtils.
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Executes a form POST request through the certificate-trusting HTTPS client and returns the
 * response parsed as a document.
 *
 * @param url    the URL to post to
 * @param params the form fields to send
 * @return the parsed response body
 * @throws Exception if the request or the parsing fails
 */
public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
	final String body = executePostWithSSL(url, params);
	return parseHtmlToDoc(body);
}
/**
 * Executes an HTTP POST request with URL-encoded form fields through the client built by
 * {@code createSSLInsecureClient()} and returns the response body.
 *
 * <p>The entries of the cookie map are sent in the {@code Cookie} header, and the cookies found
 * in the request context's cookie store after the call are merged back into the map.
 *
 * @param url    the URL to post to
 * @param params the form fields to send, as name-to-value pairs
 * @return the response body read via {@code EntityUtils.toString} with the configured charset,
 *         or an empty string when the response carries no entity
 * @throws Exception if the SSL client cannot be created, the request fails, or the body cannot
 *                   be read
 */
public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
	HttpPost post = new HttpPost(url);
	List<NameValuePair> formFields = new ArrayList<>();
	for (Map.Entry<String, String> entry : params.entrySet()) {
		formFields.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
	}
	post.setHeader("Cookie", convertCookieMapToString(cookieMap));
	post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	post.setEntity(new UrlEncodedFormEntity(formFields));
	HttpClientContext context = HttpClientContext.create();
	// The client and the response were previously never closed; try-with-resources
	// releases both on every exit path.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(post, context)) {
		HttpEntity entity = response.getEntity();
		String body = entity != null ? EntityUtils.toString(entity, charset) : "";
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		return body;
	}
}
/**
 * Executes an HTTP POST request whose body is a JSON string and returns the response body.
 *
 * <p>The entries of the cookie map are sent in the {@code Cookie} header, and the cookies found
 * in the request context's cookie store after the call are merged back into the map.
 *
 * @param url      the URL to post to
 * @param jsonBody the JSON text sent as the request body with content type application/json
 * @return the response body read via {@code EntityUtils.toString} with the configured charset,
 *         or an empty string when the response carries no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String executePostWithJson(String url, String jsonBody) throws Exception {
	HttpPost httpPost = new HttpPost(url);
	httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
	HttpClientContext context = HttpClientContext.create();
	// The client and the response were previously never closed; try-with-resources
	// releases both on every exit path.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpPost, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		// Guard against a body-less response instead of handing null to EntityUtils.
		return entity != null ? EntityUtils.toString(entity, charset) : "";
	}
}
/**
 * Executes an HTTP POST request whose body is a JSON string through the client built by
 * {@code createSSLInsecureClient()} and returns the response body.
 *
 * <p>The entries of the cookie map are sent in the {@code Cookie} header, and the cookies found
 * in the request context's cookie store after the call are merged back into the map.
 *
 * @param url      the URL to post to
 * @param jsonBody the JSON text sent as the request body with content type application/json
 * @return the response body read via {@code EntityUtils.toString} with the configured charset,
 *         or an empty string when the response carries no entity
 * @throws Exception if the SSL client cannot be created, the request fails, or the body cannot
 *                   be read
 */
public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
	HttpPost post = new HttpPost(url);
	post.setHeader("Cookie", convertCookieMapToString(cookieMap));
	post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
	HttpClientContext context = HttpClientContext.create();
	// The client and the response were previously never closed; try-with-resources
	// releases both on every exit path.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(post, context)) {
		HttpEntity entity = response.getEntity();
		String body = entity != null ? EntityUtils.toString(entity, charset) : "";
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		return body;
	}
}
/**
 * Loads a page with HtmlUnit, waits for background (asynchronous) JavaScript, and returns the
 * resulting page serialized as XML text.
 *
 * @param url the page URL
 * @return the page markup produced by {@code HtmlPage.asXml()}
 * @throws Exception if the page cannot be loaded
 */
public String getHtmlPageResponse(String url) throws Exception {
	// try-with-resources closes the simulated browser on every exit path, including when the
	// JavaScript wait or the serialization throws (previously only a getPage failure closed it).
	try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
		// Do not throw when a script on the page fails.
		webClient.getOptions().setThrowExceptionOnScriptError(false);
		// Do not throw when the HTTP status is not 200.
		webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
		webClient.getOptions().setActiveXNative(true);
		// Enable CSS.
		webClient.getOptions().setCssEnabled(true);
		// Important: enable JavaScript.
		webClient.getOptions().setJavaScriptEnabled(true);
		// Important: enable AJAX support.
		webClient.setAjaxController(new NicelyResynchronizingAjaxController());
		// Request timeout of the simulated browser.
		webClient.getOptions().setTimeout(timeout);
		// Timeout for JavaScript execution.
		webClient.setJavaScriptTimeout(timeout);
		HtmlPage page = webClient.getPage(url);
		// Blocks the calling thread while background scripts run.
		webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
		return page.asXml();
	}
}
/**
 * Loads a page with HtmlUnit (waiting for asynchronous JavaScript) and returns it parsed as a
 * document.
 *
 * @param url the page URL
 * @return the parsed page
 * @throws Exception if the page cannot be loaded or parsed
 */
public Document getHtmlPageResponseAsDocument(String url) throws Exception {
	final String renderedMarkup = getHtmlPageResponse(url);
	return parseHtmlToDoc(renderedMarkup);
}
/**
 * Copies every cookie of the given store into the given map, keyed by cookie name; an existing
 * entry with the same name is overwritten.
 *
 * @param cookieStore the store to read cookies from
 * @param cookieMap   the map that receives the name-to-value pairs
 */
private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
	cookieStore.getCookies().forEach(stored -> cookieMap.put(stored.getName(), stored.getValue()));
}
/**
 * Renders a cookie map as a {@code Cookie} header value: {@code name=value} pairs joined by
 * {@code "; "}, in the map's iteration order.
 *
 * @param map the cookies as name-to-value pairs
 * @return the joined header value, or an empty string for an empty map
 */
private String convertCookieMapToString(Map<String, String> map) {
	// StringBuilder avoids re-copying the accumulated string on every iteration and makes the
	// trailing-separator trim unnecessary.
	StringBuilder header = new StringBuilder();
	for (Map.Entry<String, String> entry : map.entrySet()) {
		if (header.length() > 0) {
			header.append("; ");
		}
		header.append(entry.getKey()).append('=').append(entry.getValue());
	}
	return header.toString();
}
/**
 * Builds an HTTP client whose TLS layer performs no validation: the trust strategy accepts
 * every certificate chain and the hostname verifier accepts every host.
 *
 * <p>NOTE(review): this disables protection against man-in-the-middle attacks; use only for
 * endpoints where that is acceptable.
 *
 * @return a new client using the permissive SSL socket factory
 * @throws GeneralSecurityException if the SSL context cannot be built
 */
private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
	// Trust strategy: accept any certificate chain.
	final SSLContext trustAllContext = new SSLContextBuilder()
			.loadTrustMaterial(null, (certChain, authType) -> true)
			.build();
	// Hostname verifier: accept any host name.
	final SSLConnectionSocketFactory socketFactory =
			new SSLConnectionSocketFactory(trustAllContext, (hostname, session) -> true);
	return HttpClients.custom().setSSLSocketFactory(socketFactory).build();
}

3.遇到的問題:

htmlunit依賴的commons-io版本較低。如果項目中其它地方引用了較高版本的commons-io,版本沖突會導致問題。處理版本沖突可參照maven的依賴引用原則:pom文件中聲明位置較前的版本會被優先引用。

Java開發工具有很多,大家以后可以慢慢了解,有些工具是比較常用的,大家可要掌握哦。

提交申請后,顧問老師會電話與您溝通安排學習

免費課程推薦 >>
技術文檔推薦 >>
主站蜘蛛池模板: 国产成人毛片亚洲精品不卡 | 色资源在线观看 | 久久综合九色 | 国产精品久久久久无码av | 在线看日本a毛片 | 九九热在线视频 | 一区二区三区四区 | 日韩欧美国产中文字幕 | 日韩欧美一区二区三区不卡 | 久久精品一区二区三区不卡 | 日本岛国片在线观看 | 伊人热久久 | 偷偷鲁国内视频视频在线 | 欧美人在线一区二区三区 | 在线亚洲欧美性天天影院 | 久久久久久影院 | 日本爱爱网站 | 精品国产综合成人亚洲区 | 亚洲第一视频在线播放 | 久久er国产精品免费观看1 | 黄在线免费看 | xxx中国www免费 | 久久99精品这里精品动漫6 | 成人久久18免费游戏网站 | 无遮挡又黄又爽又色的视频免费 | 日日夜夜天天 | 欧美激情视频网址 | 亚洲小younv另类 | 日本不卡视频在线视频观看 | 92精品国产自产在线观看 | 久久久综合九色合综国产 | 日本在线不卡视频 | 欧美另类videos粗暴黑人 | 一级成人生活片免费看 | 在线观看一级 | 在线观看一级毛片 | 国产精品123区 | 久久天天躁综合夜夜黑人鲁色 | 男女交黄 | 成人在线免费视频 | 久久在线中文字幕 |