使用 Apache Commons HttpClient + htmlparser + jericho-html 登录 Web 并解析 HTML

package web;

import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.FormFields;
import au.id.jericho.lib.html.Source;

public class HttpClientLogin {

public static void main(String args[]) throws HttpException, IOException,ParserException {
HttpClient client = new HttpClient();
client.getParams().setContentCharset(“utf-8”);
client.getHostConfiguration().setHost(“127.0.0.1”, 80, “http”);
client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

GetMethod authget = new GetMethod(“/admin/login.php”);
client.executeMethod(authget);
authget.releaseConnection();

PostMethod authpost = new PostMethod(“/admin/login.php”);
NameValuePair action = new NameValuePair(“action”,
“http://127.0.0.1/admin/login.php”);
NameValuePair method = new NameValuePair(“method”, “post”);
NameValuePair userid = new NameValuePair(“TextBox1”, “user”);
NameValuePair password = new NameValuePair(“TextBox2”, “pwd”);

authpost.setRequestBody(new NameValuePair[] { method, action, userid,password, });
client.executeMethod(authpost);
authpost.releaseConnection();

String anotherPage = “/admin/index.php”;
GetMethod anotherPageGet = new GetMethod(anotherPage);
client.executeMethod(anotherPageGet);

Source source = new Source(anotherPageGet.getResponseBodyAsStream());

source.fullSequentialParse();
FormFields formFields = source.findFormFields();

String temp = “”;
for (Iterator i = formFields.iterator(); i.hasNext();) {
FormField formField = (FormField) i.next();
if (formField.getName() != null
&& formField.getName().equals(“temp”)) {
temp = (String) formField.getFormControl().getAttributesMap()
.get(“value”);
}
}// 抓取 temp filed 当访问该page产生的临时值

NameValuePair temp_ = new NameValuePair(“temp”, temp);

PostMethod anotherPagePost = new PostMethod(anotherPage);// 提交
anotherPagePost.setRequestBody(new NameValuePair[] { temp_ });// 加入该查询条件

client.executeMethod(anotherPagePost);// 提交查询条件

String resultHtml = anotherPagePost.getResponseBodyAsString();
Parser parser = Parser.createParser(new String(resultHtml.getBytes(),”8859_1″), “8859-1”);

String filterStr = “table”;
NodeFilter filter = new TagNameFilter(filterStr);
NodeList nodes = parser.extractAllNodesThatMatch(filter);// 抓取查询结果
Node node = nodes.elementAt(3);// 截取talbe

String result = new String(node.toHtml().getBytes(“8859_1”));
}

}

原创文章,转载请注明: 转载自海波无痕

本文链接地址: 使用apache-commonsHttpclient+htmlparser+jericho-html登陆web并解析html

文章的脚注信息由WordPress的wp-posturl插件自动生成

此条目发表在javaee分类目录,贴了, , , , 标签。将固定链接加入收藏夹。

发表评论

电子邮件地址不会被公开。 必填项已用*标注

评论链接可以 移除 nofollow.