Jsoup操作HTML
依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.3</version>
</dependency>
解析HTML
// Parse an in-memory HTML string and print the text of its <title> and <p> elements.
String markup = """
<html>
<head>
<title>标题a</title>
</head>
<body>
<p>abcdefg</p>
<p>1234567</p>
</body>
</html>
""";
Document parsed = Jsoup.parse(markup);
// Elements.text() yields the combined text of every matched element,
// so both <p> bodies come back as a single string.
String titleText = parsed.getElementsByTag("title").text();
String paragraphText = parsed.getElementsByTag("p").text();
System.out.println(titleText);
System.out.println(paragraphText);
爬取网页内容
// Fetch a remote page and print its title, its links and its image URLs.
// connect() only prepares the request; get() performs the HTTP GET and parses the response.
Connection connect = Jsoup.connect("http://www.dingqinan.com/");
Document document = connect.get();
// Print the page title.
System.out.println(document.title());
// List every anchor that carries an href attribute.
Elements links = document.select("a[href]");
for (Element link : links) {
    // Link text
    System.out.println(link.text());
    // Link target, exactly as written in the markup
    System.out.println(link.attr("href"));
}
// List every image whose src contains a .png/.jpg/.jpeg/.gif extension (case-insensitive).
Elements imgs = document.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
for (Element img : imgs) {
    // The selector only guarantees that src exists. Reading data-src unconditionally
    // printed an empty string for images without it, so fall back to src in that case.
    String imageUrl = img.hasAttr("data-src") ? img.attr("data-src") : img.attr("src");
    System.out.println(imageUrl);
}
修改HTML元素
// Parse an HTML string, replace the text of the first <p class="xxhh">, and print the result.
String markup = """
<html>
<head>
<title>标题a</title>
</head>
<body>
<p>abcdefg</p>
<p class="xxhh">1234567</p>
</body>
</html>
""";
Document doc = Jsoup.parse(markup);
// text(String) overwrites the element's content in place, so the change
// shows up when the whole document is printed below.
doc.select("p.xxhh").first().text("1111");
System.out.println(doc);
防止XSS攻击
// Sanitize an untrusted HTML fragment against the "basic" safelist and print the cleaned markup.
String untrusted = "<p><a href='http://www.dqn.com' onclick='getCookies()'>提交</a></p>";
Safelist allowed = Safelist.basic();
System.out.println(Jsoup.clean(untrusted, allowed));
输出结果中的onclick事件被清除了